In [41]:
import pandas as pd

# Load the new dataset (path is relative to the notebook's working directory)
new_data_path = 'data/raw/1000Folk_Story_around_the_Globe.csv'
df = pd.read_csv(new_data_path)


In [42]:
# Inspect the column names of the loaded frame
df.columns

Index(['genre', 'source', 'region', 'title', 'full_text'], dtype='object')

In [19]:
# Count rows for every (source, region) combination
df.groupby(['source', 'region']).size()


source                     region        
African folktales          Nigeria            40
                           South Africa       15
Andersen fairy tales       Denmark            50
Andrew Lang fairy tales    Scotland          116
Arab folktales             Arab               24
Asian folktales            China              80
                           India              33
                           Japan              67
Australian folktales       Australia          31
European folktales         British isles       6
                           Celtic             26
                           Czech              35
                           Dutch              21
                           England            43
                           Germany            38
                           Ireland            38
                           Italy              30
                           Norway             15
                           Poland              7
                           

In [20]:
# Normalise the source labels: trim, lower-case and remove every space so that
# spelling variants collapse to a single key (e.g. 'Indian  folktales' with a
# double space becomes 'indianfolktales').  Dropping all spaces already covers
# the double-space variant, so no separate special case is required.
df["source"] = (
    df["source"]
    .str.strip()
    .str.lower()
    .str.replace(" ", "", regex=False)  # literal match, independent of the pandas default
)


In [21]:
# Rows labelled 'asianfolktales' whose region is India are moved into the
# 'indianfolktales' source.
df.loc[(df['source'] == 'asianfolktales') & (df['region'] == 'India'), 'source'] = 'indianfolktales'


In [22]:
# Re-count rows per (source, region) to check the normalised labels
df.groupby(['source', 'region']).size()


source                   region        
africanfolktales         Nigeria            40
                         South Africa       15
andersenfairytales       Denmark            50
andrewlangfairytales     Scotland          116
arabfolktales            Arab               24
asianfolktales           China              80
                         Japan              67
australianfolktales      Australia          31
europeanfolktales        British isles       6
                         Celtic             26
                         Czech              35
                         Dutch              21
                         England            43
                         Germany            38
                         Ireland            38
                         Italy              30
                         Norway             15
                         Poland              7
                         Portugal           34
                         Romania            18
                    

In [23]:
# Check the current column names
df.columns

Index(['genre', 'source', 'region', 'title', 'full_text'], dtype='object')

In [24]:
# Rename to the column names used by the modelling cells ('body', 'category').
# Reassigning the result instead of using `inplace=True` avoids mutating the
# existing frame in place; `inplace` brings no performance benefit.
df = df.rename(columns={'full_text': 'body', 'source': 'category'})


In [35]:
import pandas as pd
import numpy as np
import re
import emoji
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import RidgeClassifier
from sklearn.model_selection import train_test_split

# Fetch the NLTK resources needed for stop-word removal and lemmatisation
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Pipeline step that pulls a single column out of its input
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns one column of the input.

    Parameters
    ----------
    column :
        Key used to index the input inside ``transform``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Log which column is being selected and return ``X[self.column]``."""
        chosen = self.column
        print(f"Selecting column: {chosen}")
        return X[chosen]

# Text preprocessor
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Add cleaned, lemmatised copies of the 'title' and 'body' columns.

    ``transform`` expects a DataFrame with 'title' and 'body' columns and
    returns a new DataFrame that additionally holds 'title_cleaned' and
    'body_cleaned'.  The input frame is left untouched.
    """

    def __init__(self):
        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with 'title_cleaned' and 'body_cleaned' added.

        ``DataFrame.assign`` builds a new frame, so the caller's data (for
        example a train/test split) is not modified and no
        SettingWithCopyWarning is triggered by writing into a slice.
        """
        print("Starting text preprocessing...")
        cleaned = X.assign(
            title_cleaned=X['title'].apply(self.clean_text).apply(self.lemmatize_and_remove_stopwords),
            body_cleaned=X['body'].apply(self.clean_text).apply(self.lemmatize_and_remove_stopwords),
        )
        print("Text preprocessing completed.")
        return cleaned

    def clean_text(self, text):
        """Lower-case ``text`` and strip URLs, emoji, punctuation and digits.

        Null values are returned unchanged so the next step can map them to
        an empty string.  Underscores survive because ``\\w`` matches them.
        """
        if pd.isnull(text):
            return text
        text = text.lower()
        # Drop http(s):// and www. links up to the next whitespace
        text = re.sub(r'http://\S+|https://\S+|www\.\S+', '', text, flags=re.MULTILINE)
        text = emoji.replace_emoji(text, replace='')
        # Punctuation, digits and newlines become spaces ...
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\n', ' ', text)
        # ... and runs of whitespace collapse to a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def lemmatize_and_remove_stopwords(self, text):
        """Drop English stop words and lemmatise the remaining tokens.

        Returns a space-joined string; null or empty input yields ''.
        """
        if pd.isna(text):
            return ''  # Return an empty string for NaN values
        if text == "":
            return text
        words = text.split()
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(lemmatized_words)

# Weighted TF-IDF features for the body text
class BodyTfidfTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF vectoriser whose output matrix is multiplied by ``weight``.

    Parameters
    ----------
    weight : float, default 1.45
        Factor applied to every TF-IDF value returned by ``transform``.
    """

    def __init__(self, weight=1.45):
        self.weight = weight
        self.tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.6,
            min_df=2,
            ngram_range=(1, 1),
        )

    def fit(self, X, y=None):
        """Fit the wrapped vectoriser on ``X`` and return ``self``."""
        print("Fitting BodyTfidfTransformer...")
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        """Vectorise ``X`` and scale the resulting matrix by ``self.weight``."""
        print("Transforming body text to TF-IDF features...")
        return self.tfidf_vectorizer.transform(X) * self.weight

# Unweighted TF-IDF features for the title text
class TitleTfidfTransformer(BaseEstimator, TransformerMixin):
    """Thin wrapper around a ``TfidfVectorizer`` that logs fit/transform calls."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.6,
            min_df=2,
            ngram_range=(1, 1),
        )

    def fit(self, X, y=None):
        """Fit the wrapped vectoriser on ``X`` and return ``self``."""
        print("Fitting TitleTfidfTransformer...")
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        """Return the TF-IDF matrix of ``X`` from the fitted vectoriser."""
        print("Transforming title text to TF-IDF features...")
        title_matrix = self.tfidf_vectorizer.transform(X)
        return title_matrix

# Assemble the model: preprocess text, build title and body TF-IDF features
# side by side, then classify with a ridge classifier.
_title_features = Pipeline(steps=[
    ('selector', ColumnSelector('title_cleaned')),
    ('tfidf', TitleTfidfTransformer()),
])
_body_features = Pipeline(steps=[
    ('selector', ColumnSelector('body_cleaned')),
    ('tfidf', BodyTfidfTransformer()),
])
pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('features', FeatureUnion(transformer_list=[
        ('title_tfidf', _title_features),
        ('body_tfidf', _body_features),
    ])),
    ('classifier', RidgeClassifier(alpha=2, tol=0.01, random_state=42)),
])

# Model on the cleaned frame; `data` is an alias of `df`, not a copy
data = df

# Select the text features and the target label, then hold out 20% for testing
# (fixed random_state so the split is repeatable)
X = data[['title', 'body']]
y = data['category']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Evaluate the pipeline on the test data
accuracy = pipeline.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bahar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\bahar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Fitting TitleTfidfTransformer...
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Fitting BodyTfidfTransformer...
Transforming body text to TF-IDF features...
Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Transforming body text to TF-IDF features...
Accuracy: 0.85


In [34]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Predict labels for the held-out test set
y_pred = pipeline.predict(X_test)

# Compute performance metrics, averaged over classes weighted by support
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')

print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Detailed per-class report
report = classification_report(y_test, y_pred)
print("Classification Report:")
print(report)


Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Transforming body text to TF-IDF features...
Precision: 0.90
Recall: 0.87
F1 Score: 0.86
Classification Report:
                         precision    recall  f1-score   support

       africanfolktales       1.00      0.94      0.97        16
     andersenfairytales       1.00      0.75      0.86        12
   andrewlangfairytales       0.91      0.40      0.56        25
          arabfolktales       1.00      1.00      1.00         4
         asianfolktales       0.97      0.94      0.95        31
    australianfolktales       1.00      0.78      0.88         9
      europeanfolktales       0.71      0.99      0.82        74
      filipinofolktales       0.88      0.88      0.88         8
        grimmfairytales       0.98      0.98      0.98        44
        indianfolktales       1.00      0.57      0.73         7
na

In [37]:
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import RidgeClassifier
from scipy.sparse import hstack
import pandas as pd
import numpy as np
import re
import emoji
import nltk

# Fetch the NLTK resources needed for stop-word removal and lemmatisation
for _nltk_resource in ('stopwords', 'wordnet'):
    nltk.download(_nltk_resource)

# Pipeline step that pulls a single column out of its input
class ColumnSelector(BaseEstimator, TransformerMixin):
    """Stateless transformer that returns one column of the input.

    Parameters
    ----------
    column :
        Key used to index the input inside ``transform``.
    """

    def __init__(self, column):
        self.column = column

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X):
        """Log which column is being selected and return ``X[self.column]``."""
        chosen = self.column
        print(f"Selecting column: {chosen}")
        return X[chosen]

# Text preprocessor
class TextPreprocessor(BaseEstimator, TransformerMixin):
    """Add cleaned, lemmatised copies of the 'title' and 'body' columns.

    ``transform`` expects a DataFrame with 'title' and 'body' columns and
    returns a new DataFrame that additionally holds 'title_cleaned' and
    'body_cleaned'.  The input frame is left untouched.
    """

    def __init__(self):
        # Imported locally because this cell's import block does not bring
        # these names in; without it the class only works when an earlier
        # cell has already imported them into the kernel.
        from nltk.corpus import stopwords
        from nltk.stem import WordNetLemmatizer

        self.lemmatizer = WordNetLemmatizer()
        self.stop_words = set(stopwords.words('english'))

    def fit(self, X, y=None):
        """Nothing is learned; the transformer itself is returned."""
        return self

    def transform(self, X, y=None):
        """Return a copy of ``X`` with 'title_cleaned' and 'body_cleaned' added.

        ``DataFrame.assign`` builds a new frame, so the caller's data (for
        example a train/test split or a cross-validation fold) is not
        modified and no SettingWithCopyWarning is triggered by writing into
        a slice.
        """
        print("Starting text preprocessing...")
        cleaned = X.assign(
            title_cleaned=X['title'].apply(self.clean_text).apply(self.lemmatize_and_remove_stopwords),
            body_cleaned=X['body'].apply(self.clean_text).apply(self.lemmatize_and_remove_stopwords),
        )
        print("Text preprocessing completed.")
        return cleaned

    def clean_text(self, text):
        """Lower-case ``text`` and strip URLs, emoji, punctuation and digits.

        Null values are returned unchanged so the next step can map them to
        an empty string.  Underscores survive because ``\\w`` matches them.
        """
        if pd.isnull(text):
            return text
        text = text.lower()
        # Drop http(s):// and www. links up to the next whitespace
        text = re.sub(r'http://\S+|https://\S+|www\.\S+', '', text, flags=re.MULTILINE)
        text = emoji.replace_emoji(text, replace='')
        # Punctuation, digits and newlines become spaces ...
        text = re.sub(r'[^\w\s]', ' ', text)
        text = re.sub(r'\d', ' ', text)
        text = re.sub(r'\n', ' ', text)
        # ... and runs of whitespace collapse to a single space
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    def lemmatize_and_remove_stopwords(self, text):
        """Drop English stop words and lemmatise the remaining tokens.

        Returns a space-joined string; null or empty input yields ''.
        """
        if pd.isna(text):
            return ''  # Return an empty string for NaN values
        if text == "":
            return text
        words = text.split()
        lemmatized_words = [self.lemmatizer.lemmatize(word) for word in words if word not in self.stop_words]
        return ' '.join(lemmatized_words)

# Weighted TF-IDF features for the body text
class BodyTfidfTransformer(BaseEstimator, TransformerMixin):
    """TF-IDF vectoriser whose output matrix is multiplied by ``weight``.

    Parameters
    ----------
    weight : float, default 1.45
        Factor applied to every TF-IDF value returned by ``transform``.
    """

    def __init__(self, weight=1.45):
        self.weight = weight
        self.tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.6,
            min_df=2,
            ngram_range=(1, 1),
        )

    def fit(self, X, y=None):
        """Fit the wrapped vectoriser on ``X`` and return ``self``."""
        print("Fitting BodyTfidfTransformer...")
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        """Vectorise ``X`` and scale the resulting matrix by ``self.weight``."""
        print("Transforming body text to TF-IDF features...")
        return self.tfidf_vectorizer.transform(X) * self.weight

# Unweighted TF-IDF features for the title text
class TitleTfidfTransformer(BaseEstimator, TransformerMixin):
    """Thin wrapper around a ``TfidfVectorizer`` that logs fit/transform calls."""

    def __init__(self):
        self.tfidf_vectorizer = TfidfVectorizer(
            sublinear_tf=True,
            max_df=0.6,
            min_df=2,
            ngram_range=(1, 1),
        )

    def fit(self, X, y=None):
        """Fit the wrapped vectoriser on ``X`` and return ``self``."""
        print("Fitting TitleTfidfTransformer...")
        self.tfidf_vectorizer.fit(X)
        return self

    def transform(self, X):
        """Return the TF-IDF matrix of ``X`` from the fitted vectoriser."""
        print("Transforming title text to TF-IDF features...")
        title_matrix = self.tfidf_vectorizer.transform(X)
        return title_matrix

# Assemble the model: preprocess text, build title and body TF-IDF features
# side by side, then classify with a ridge classifier.
_title_features = Pipeline(steps=[
    ('selector', ColumnSelector('title_cleaned')),
    ('tfidf', TitleTfidfTransformer()),
])
_body_features = Pipeline(steps=[
    ('selector', ColumnSelector('body_cleaned')),
    ('tfidf', BodyTfidfTransformer()),
])
pipeline = Pipeline(steps=[
    ('preprocessor', TextPreprocessor()),
    ('features', FeatureUnion(transformer_list=[
        ('title_tfidf', _title_features),
        ('body_tfidf', _body_features),
    ])),
    ('classifier', RidgeClassifier(alpha=2, tol=0.01, random_state=42)),
])
# Split data
print("Splitting data into training and test sets...")
X = df[['title', 'body']]
y = df['category']
# Fixed seed (same value as the classifier's random_state) so the split, the
# cross-validation scores and the final accuracy are reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Data splitting completed.")

# Cross-validate the whole pipeline on the raw training split; preprocessing
# and TF-IDF fitting happen inside every fold, so no fold sees the others' data.
print("Performing cross-validation...")
cv_scores = cross_val_score(pipeline, X_train, y_train, cv=15)  # 15-fold cross-validation
print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {np.mean(cv_scores)}")

# Train the pipeline on the full training split
print("Training the pipeline...")
pipeline.fit(X_train, y_train)
print("Pipeline training completed.")

# Predict and evaluate the model on the held-out test split
print("Making predictions and evaluating the model...")
y_pred = pipeline.predict(X_test)
print("Prediction and evaluation completed.")

# Output results: fraction of test rows whose predicted label matches
accuracy = (y_pred == y_test).mean()
print(f"Model accuracy: {accuracy}")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bahar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\bahar/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Splitting data into training and test sets...
Data splitting completed.
Performing cross-validation...
Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Fitting TitleTfidfTransformer...
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Fitting BodyTfidfTransformer...
Transforming body text to TF-IDF features...
Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Transforming body text to TF-IDF features...
Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Fitting TitleTfidfTransformer...
Transforming title text to TF-IDF features...
Selecting column: body_cleaned
Fitting BodyTfidfTransformer...
Transforming body text to TF-IDF features...
Starting text preprocessing...
Text preprocessing completed.
Selecting column: title_cleaned
Transforming title text to TF