In [None]:
import re
import nltk
import time
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, f_classif, chi2
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
#from sklearn.model_selection import cross_validate
from sklearn.decomposition import PCA
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.base import is_classifier, is_regressor

def train_and_evaluate_regressor(regressor, X_train, y_train, X_test, y_test, method_name):
    """Fit a regressor, predict on the test split and report errors and timings.

    Args:
        regressor: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix.
        y_train: Training targets.
        X_test: Test feature matrix.
        y_test: Test targets.
        method_name: Label stored in the ``Method`` column of the result.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Method``, ``MAE``,
        ``RMSE``, ``Training Time`` and ``Testing Time`` (times in seconds).
    """
    # perf_counter is monotonic and high-resolution, so very short fit/predict
    # calls are not reported as 0 and clock adjustments cannot skew the result.
    fit_start = time.perf_counter()
    regressor.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred = regressor.predict(X_test)
    testing_time = time.perf_counter() - predict_start

    mae = mean_absolute_error(y_test, y_pred)
    # The ``squared=False`` keyword was deprecated in scikit-learn 1.4 and
    # removed in later releases; taking the square root of the MSE gives the
    # same RMSE for a single-output target on every version.
    rmse = mean_squared_error(y_test, y_pred) ** 0.5

    result_table = pd.DataFrame({
        'Method': [method_name],
        'MAE': [mae],
        'RMSE': [rmse],
        'Training Time': [training_time],
        'Testing Time': [testing_time]
    })

    return result_table

def train_and_evaluate_classifier(classifier, X_train, y_train, X_test, y_test, method_name):
    """Fit a classifier, predict on the test split and report scores and timings.

    Args:
        classifier: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training feature matrix; its second dimension is reported as
            the number of features.
        y_train: Training labels.
        X_test: Test feature matrix.
        y_test: Test labels.
        method_name: Label stored in the ``Method`` column of the result.

    Returns:
        A one-row ``pandas.DataFrame`` with the columns ``Method``,
        ``Accuracy``, ``F-measure``, ``Precision``, ``Recall``,
        ``Number of Features``, ``Training Time`` and ``Testing Time``
        (times in seconds).
    """
    # perf_counter is monotonic and high-resolution, so very short fit/predict
    # calls are not reported as 0 and clock adjustments cannot skew the result.
    fit_start = time.perf_counter()
    classifier.fit(X_train, y_train)
    training_time = time.perf_counter() - fit_start

    predict_start = time.perf_counter()
    y_pred = classifier.predict(X_test)
    testing_time = time.perf_counter() - predict_start

    accuracy = accuracy_score(y_test, y_pred)
    # NOTE(review): these three scorers are called with their defaults, which
    # in scikit-learn means binary averaging -- confirm the target is binary,
    # otherwise an explicit ``average=`` is required.
    f1 = f1_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    num_features = X_train.shape[1]

    result_table = pd.DataFrame({
        'Method': [method_name],
        'Accuracy': [accuracy],
        'F-measure': [f1],
        'Precision': [precision],
        'Recall': [recall],
        'Number of Features': [num_features],
        'Training Time': [training_time],
        'Testing Time': [testing_time]
    })

    return result_table

def train_and_evaluate_all_models(X_train, y_train, X_test, y_test, vectorizer, coun_vect, pca, kbest, models):
    """Evaluate every model on each text representation built from one split.

    The raw texts are turned into TF-IDF and n-gram count matrices, each of
    which is additionally passed through ``kbest`` (feature selection) and
    ``pca``. Every model is then trained and scored on each representation
    via ``train_and_evaluate_regressor`` or ``train_and_evaluate_classifier``.

    Args:
        X_train: Training texts.
        y_train: Training targets.
        X_test: Test texts.
        y_test: Test targets.
        vectorizer: TF-IDF vectorizer; refitted on ``X_train``.
        coun_vect: Count vectorizer for n-grams; refitted on ``X_train``.
        pca: Decomposition object; refitted here (last on the n-gram matrix).
        kbest: Feature selector; refitted here (last on the n-gram matrix).
        models: Mapping of display name to estimator.

    Returns:
        A flat list of one-row result tables. Per model the order is TF-IDF,
        N-gram, N-gram + FS, TF-IDF + FS and, unless the model is a
        ``MultinomialNB``, TF-IDF + PCA and N-gram + PCA.

    Raises:
        ValueError: If a model is neither a regressor nor a classifier.
    """
    results = []
    if not models:
        # Nothing to evaluate: leave the passed-in transformers untouched.
        return results

    # The representations do not depend on the model, so build them once
    # instead of refitting every transformer for every model.
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)
    X_train_ngram = coun_vect.fit_transform(X_train)
    X_test_ngram = coun_vect.transform(X_test)

    # ``kbest`` and ``pca`` are shared between the two representations, so the
    # test split must be transformed before the object is refitted on the
    # other matrix.
    X_train_tfidf_fs = kbest.fit_transform(X_train_tfidf, y_train)
    X_test_tfidf_fs = kbest.transform(X_test_tfidf)
    X_train_ngram_fs = kbest.fit_transform(X_train_ngram, y_train)
    X_test_ngram_fs = kbest.transform(X_test_ngram)

    X_train_tfidf_pca = pca.fit_transform(X_train_tfidf.toarray())
    X_test_tfidf_pca = pca.transform(X_test_tfidf.toarray())
    X_train_ngram_pca = pca.fit_transform(X_train_ngram.toarray())
    X_test_ngram_pca = pca.transform(X_test_ngram.toarray())

    # (label suffix, train matrix, test matrix), in the order results are reported.
    base_features = [
        ('TF-IDF', X_train_tfidf, X_test_tfidf),
        ('N-gram', X_train_ngram, X_test_ngram),
        ('N-gram +fs', X_train_ngram_fs, X_test_ngram_fs),
        ('TF-IDF + FS', X_train_tfidf_fs, X_test_tfidf_fs),
    ]
    pca_features = [
        ('TF-IDF + PCA', X_train_tfidf_pca, X_test_tfidf_pca),
        ('N-gram + PCA', X_train_ngram_pca, X_test_ngram_pca),
    ]

    for model_name, model in models.items():
        if is_regressor(model):
            evaluate = train_and_evaluate_regressor
        elif is_classifier(model):
            evaluate = train_and_evaluate_classifier
        else:
            raise ValueError(f"Unsupported model type: {type(model)}")

        # PCA output can contain negative values, which MultinomialNB rejects.
        # Test the estimator's type rather than comparing against one specific
        # global instance (the old ``model != GaussianNB`` check compared an
        # instance with a class and therefore never had any effect).
        skip_pca = isinstance(model, MultinomialNB)
        feature_sets = base_features if skip_pca else base_features + pca_features

        results.extend([
            evaluate(model, train_matrix, y_train, test_matrix, y_test, f'{model_name} {label}')
            for label, train_matrix, test_matrix in feature_sets
        ])

        # Progress output (Turkish): Naive Bayes vs. non-Naive-Bayes branch.
        if skip_pca:
            print(f"naive bayes içeren kısım model {model} güncel result {results}")
        else:
            print(f"naive bayes olmayan kısım model {model} güncel result {results}")

    return results

In [None]:
# English stop-word list; the NLTK corpus has to be downloaded before it can be read.
nltk.download('stopwords')
stop_words = {*stopwords.words('english')}

# Porter stemmer used to reduce tokens to their stems.
stemmer = PorterStemmer()

# TF-IDF representation, capped at 100 features.
vectorizer = TfidfVectorizer(
    max_features=100,
)

# Raw counts over unigrams and bigrams, capped at 100 features.
coun_vect = CountVectorizer(
    max_features=100,
    ngram_range=(1, 2),
)

# Load the dataset from its CSV file.
data = pd.read_csv("//content//fake_news_dataset.csv")

def preprocess(text):
    """Normalise a raw text string before tokenisation.

    The text is lower-cased, URL-like tokens are deleted, every non-word
    character becomes a space, a lone ASCII letter surrounded by whitespace
    is collapsed into a single space, and finally leading/trailing whitespace
    is trimmed. Runs of inner whitespace are not collapsed.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    # (pattern, replacement, flags), applied strictly in this order.
    substitutions = (
        (r"http\S+|www\S+|https\S+", "", re.MULTILINE),
        (r'\W', ' ', 0),
        (r'\s+[a-zA-Z]\s+', ' ', 0),
    )

    cleaned = text.lower()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)
    return cleaned.strip()

# Clean every tweet, drop English stop words, then stem the remaining tokens.
data['preprocessed'] = data['tweet'].apply(preprocess)
data['preprocessed'] = data['preprocessed'].apply(lambda x: ' '.join([word for word in x.split() if word.lower() not in stop_words]))
data['preprocessed'] = data['preprocessed'].apply(lambda x: ' '.join([stemmer.stem(word) for word in x.split()]))


# Classification models.
knn_classifier = KNeighborsClassifier()
nb_classifier = MultinomialNB()
dt_classifier = DecisionTreeClassifier()

# Regression models.
knn_regressor = KNeighborsRegressor()
dt_regressor = DecisionTreeRegressor()
# Not part of the ``regressors`` mapping below; kept in case later cells use it.
nb_regressor = GaussianNB()
lr_regressor = LinearRegression()

# NOTE(review): both vectorizers are capped at 100 features, so k=100 and
# n_components=100 cannot reduce the dimensionality -- confirm these values.
# NOTE(review): this chi2-based selector is also fitted against the regression
# target below; chi2 is documented as a classification score function, so a
# regression score function may be more appropriate -- confirm.
kbest = SelectKBest(chi2, k = 100)
pca = PCA(n_components = 100)

# A fixed seed makes the shuffled folds reproducible from run to run.
kf = KFold(n_splits=5, shuffle=True, random_state=42)


# Not referenced in the visible code; kept in case later cells use them.
knn_results = []
dt_results = []
nb_results = []
knn_results_reg = []
dt_results_reg = []
nb_results_reg = []

# Inputs (cleaned text) and the two targets: class label and numeric score.
X=data['preprocessed']
y=data['target']
y_reg = data['score']
classifiers = {'KNN': knn_classifier, 'Decision Tree': dt_classifier, 'Naive Bayes': nb_classifier}
regressors = {'KNN': knn_regressor, 'Decision Tree': dt_regressor, 'Linear Regression': lr_regressor}

# One entry per fold; each entry is the list of one-row result tables for that fold.
clas_res = []
regres_res = []

for train_i, test_i in kf.split(X, y):
    # KFold yields positional indices, so select rows by position (.iloc)
    # rather than by index label; this stays correct for any DataFrame index.
    X_train, X_test = X.iloc[train_i], X.iloc[test_i]
    y_train, y_test = y.iloc[train_i], y.iloc[test_i]

    # Train and evaluate classifiers
    classifier_results = train_and_evaluate_all_models(X_train, y_train, X_test, y_test, vectorizer, coun_vect, pca, kbest, classifiers)

    clas_res.append(classifier_results)

    # Train and evaluate regressors
    regressor_results = train_and_evaluate_all_models(X_train, y_reg.iloc[train_i], X_test, y_reg.iloc[test_i], vectorizer, coun_vect, pca, kbest, regressors)
    regres_res.append(regressor_results)





In [None]:

def _stack_folds(fold_results):
    """Concatenate the result tables of every fold into one long table."""
    per_fold_tables = []
    for tables in fold_results:
        per_fold_tables.append(pd.concat(tables, ignore_index=True))
    return pd.concat(per_fold_tables, ignore_index=True)


# One row per (fold, method) for each task.
merged_clas_res = _stack_folds(clas_res)
merged_regres_res = _stack_folds(regres_res)

# Mean of every metric across folds, grouped by method label.
avg_clas_res = merged_clas_res.groupby('Method').mean().reset_index()
avg_regres_res = merged_regres_res.groupby('Method').mean().reset_index()

# Report the averaged classification results, then the regression results
# (the headings are in Turkish).
print("Ortalama Sınıflandırma Sonuçları:", avg_clas_res, sep="\n")
print("\nOrtalama Regresyon Sonuçları:", avg_regres_res, sep="\n")
