In [None]:
import pandas as pd
from pathlib import Path
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

def createDataFrameTemplate() -> pd.DataFrame:
    """Return an empty DataFrame carrying the article schema.

    The frame has no rows; its columns, in order, are ``id`` (int),
    ``title``, ``content``, ``review``, ``publish_date`` (all str) and
    ``isReal`` (bool).
    """
    # (column name, dtype) pairs, listed in the final column order.
    schema = (
        ('id', 'int'),
        ('title', 'str'),
        ('content', 'str'),
        ('review', 'str'),
        ('publish_date', 'str'),
        ('isReal', 'bool'),
    )
    return pd.DataFrame(
        {name: pd.Series(dtype=kind) for name, kind in schema}
    )

def returnDataFrame(data_frame, isReal) -> pd.DataFrame:
    """Convert a raw article frame into the labelled article schema.

    Columns of ``data_frame`` are read by position: column 0 becomes
    ``title`` (stringified, leading whitespace stripped), column 1
    ``content``, column 2 ``review`` and column 3 ``publish_date``.

    Args:
        data_frame: Source frame with at least four columns.
        isReal: Label written to the ``isReal`` column of every row.

    Returns:
        A DataFrame with columns ``id``, ``title``, ``content``, ``review``,
        ``publish_date`` and ``isReal``. ``id`` numbers the input rows from 1
        and is also used as the index. Rows come back in reverse input order
        (last input row first); this keeps the ordering produced by the
        earlier row-by-row prepend so seeded downstream splits are unchanged.
        An empty input yields the empty template frame.

    Raises:
        IndexError: If a non-empty ``data_frame`` has fewer than four columns.
    """
    rows = data_frame.values
    row_count = len(rows)
    if row_count == 0:
        return createDataFrameTemplate()

    ids = list(range(1, row_count + 1))

    # Build every column in one pass. Calling pd.concat once per row copies
    # the accumulated frame each time, which is quadratic in the row count.
    new_data = pd.DataFrame(
        {
            'id': ids,
            'title': [str(value).lstrip() for value in rows[:, 0]],
            'content': rows[:, 1].tolist(),
            'review': rows[:, 2].tolist(),
            'publish_date': rows[:, 3].tolist(),
            'isReal': isReal,
        },
        index=ids,
    )

    # Newest row first, matching the order the prepend-based build returned.
    return new_data.iloc[::-1]

def ReturnDataSets():
    """Load the fake and true article CSVs as labelled frames.

    Reads ``Fake.csv`` and ``True.csv`` from the current working directory
    and passes each through ``returnDataFrame`` with ``isReal`` set to
    ``False`` and ``True`` respectively.

    Returns:
        A ``(fake_data, true_data)`` tuple of DataFrames.
    """
    # Read both files before converting either one.
    raw_fake = pd.read_csv(Path("Fake.csv"))
    raw_true = pd.read_csv(Path("True.csv"))

    return returnDataFrame(raw_fake, False), returnDataFrame(raw_true, True)

# Load both labelled sets and stack them into one corpus.
fake_data, true_data = ReturnDataSets()

FullSet = pd.concat([fake_data, true_data])

vectorizer = TfidfVectorizer(stop_words='english')

# Hold out 20% for evaluation; stratifying keeps the fake/real ratio the
# same in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    FullSet['content'],
    FullSet['isReal'],
    test_size=0.2,
    random_state=42,
    stratify=FullSet['isReal'],
)

# The vocabulary and IDF weights are learned from the training split only;
# the test split is just transformed with them.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_Test_tfidf = vectorizer.transform(X_test)

# Candidate neighbour counts: 1 through 19.
param_grid = list(range(1, 20))

# NOTE(review): k is chosen by accuracy on the held-out test split, so the
# reported best accuracy is an optimistic estimate. Cross-validating on the
# training split (GridSearchCV is already imported) would avoid that.
best_acc = 0
best_report = None
best_knn = 0
for x in param_grid:

    print("Neighbors :", x)
    knn = KNeighborsClassifier(n_neighbors=x, metric='cosine')
    knn.fit(X_train_tfidf, y_train)

    y_pred = knn.predict(X_Test_tfidf)

    curr_acc = accuracy_score(y_test, y_pred)
    print("Accuracy:", curr_acc)

    # Boolean labels sort as [False, True], so the first display name belongs
    # to isReal == False (fake news). Pinning `labels` keeps names and classes
    # aligned; the earlier ['True', 'Fake'] order labelled them backwards.
    report = classification_report(
        y_test,
        y_pred,
        labels=[False, True],
        target_names=['Fake', 'True'],
    )
    print(report)

    # On a tie the later (larger) k wins.
    if curr_acc >= best_acc:
        best_acc, best_report, best_knn = curr_acc, report, x

print("Best KNN Value: ", best_knn)
print("Best Accuracy: ", best_acc)
print("Best report: " , best_report)
    











Neighbors : 1
Accuracy: 0.8433862433862434
              precision    recall  f1-score   support

        True       0.91      0.76      0.83      1896
        Fake       0.79      0.93      0.86      1884

    accuracy                           0.84      3780
   macro avg       0.85      0.84      0.84      3780
weighted avg       0.85      0.84      0.84      3780

Neighbors : 2
Accuracy: 0.8624338624338624
              precision    recall  f1-score   support

        True       0.86      0.87      0.86      1896
        Fake       0.87      0.85      0.86      1884

    accuracy                           0.86      3780
   macro avg       0.86      0.86      0.86      3780
weighted avg       0.86      0.86      0.86      3780

Neighbors : 3
Accuracy: 0.8468253968253968
              precision    recall  f1-score   support

        True       0.92      0.77      0.83      1896
        Fake       0.80      0.93      0.86      1884

    accuracy                           0.85      3780