Team 14: Zachary Almaraz, Caleb Fowler, Jonathan Pierre, Zachary Vasey

Notes

Create a link to shared MachineLearningProject in personal drive.

In [0]:
# Mount Google Drive so the shared project folder is reachable from this Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Switch to the shared project directory (later cells use relative paths such as
# 'data/news.csv') and list its contents.
%cd /content/drive/My\ Drive/MachineLearningProject/
%ls -la

Inspect data directory.

In [0]:
%ls data -la

Import news.csv dataset.

In [0]:
import pandas as pd
# Load the raw articles; the path is relative to the current working directory.
df_original = pd.read_csv('data/news.csv') 
# inspect() expects a list of {'name': <label>, 'df': <DataFrame>} records.
datasets = [ {'name' : 'news.csv',     'df' : df_original}, ]

Look at data.

In [0]:
from IPython.display import display

def inspect(datasets):
  """Print a short report for every dataset: shape, missing values, first rows.

  datasets: iterable of dicts, each with a 'name' label and a 'df' DataFrame.
  Returns nothing; all output goes to stdout / the notebook display.
  """
  rule = '-' * 25
  for entry in datasets:
    frame = entry["df"]
    # banner: rule / name / rule
    print(rule + '\n' + entry["name"] + '\n' + rule)
    print('Shape: ' + str(frame.shape))
    print()
    print('Missing values:')
    print(frame.isna().sum())  # per-column count of NaN cells
    print()
    print("first 3 samples:")
    display(frame.head(3))

inspect(datasets)

Drop samples with missing features.

In [0]:
df_original = df_original.dropna()
datasets = [ {'name' : 'news.csv',     'df' : df_original}, ]
inspect( datasets )

Remove Duplicates by Title

In [0]:
# Keep only the first article for each distinct title and report how many rows were dropped.
len_before = len(df_original)
df_original.drop_duplicates(subset="title", keep='first', inplace=True)
len_after = len(df_original)
print(f'number of duplicate articles removed: {len_before - len_after}')

Feature Extraction

In [0]:
df = df_original.copy()

# Columns kept in the final feature frame: the label first, then the engineered features.
new_dimensions = ['fake']


def _text_stats(text):
  """Return (avg word length, word count, char count, all-caps word count) for one string.

  Words are the pieces produced by splitting on single spaces, so consecutive
  spaces yield empty "words" of length 0 that still count toward the totals.
  """
  words = text.split(' ')  # split once; the result is never empty, so the average is safe
  word_lengths = [len(word) for word in words]
  # words made only of capital letters, not counting one-letter words ("I", "A", ...)
  num_uppercase = sum(1 for word in words if len(word) > 1 and word.isupper())
  return sum(word_lengths) / len(words), len(words), len(text), num_uppercase


for feature in ('title', 'text'):
  names = ['avg_word_len_'+feature,
           'num_words_'+feature,
           'total_chars_'+feature,
           'words_uppercase_'+feature]

  new_dimensions.extend(names)

  # one stats tuple per article, in row order; tuple position matches `names`
  stats = [_text_stats(text) for text in df[feature]]
  for position, name in enumerate(names):
    df[name] = [row[position] for row in stats]

# drop the raw text columns; keep only the label and the numeric features
df = df[new_dimensions]

print('done')

Inspect the new dataframe.

In [0]:
inspect(datasets = [{'name' : 'news_modified', 'df' : df},])

Examine the "uppercase words" dimension.

In [0]:
import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

real = df[df.fake == 0]
fake = df[df.fake == 1]


def plot_uppercase_hist(feature, figsize):
  """Draw side-by-side histograms of all-caps word counts for real vs. fake news.

  feature: 'title' or 'text'; selects the words_uppercase_<feature> column.
  figsize: matplotlib figure size tuple.
  """
  column = 'words_uppercase_' + feature
  fig, (ax_real, ax_fake) = plt.subplots(1, 2, figsize=figsize, sharey=True)
  fig.suptitle(feature + ": uppercase words", fontsize=20)

  panels = ((ax_real, real[column], 'Real News', 'blue'),
            (ax_fake, fake[column], 'Fake News', 'red'))
  for ax, counts, title, color in panels:
    # One bin per count value. hist() rejects a bin count of 0, which is what
    # max(counts) gives when no article in the subset has an all-caps word.
    n_bins = max(int(counts.max()), 1) if len(counts) else 1
    ax.hist(counts, n_bins, facecolor=color, alpha=0.7)
    ax.set_title(title)
    ax.set_xlabel('uppercase words')
  ax_real.set_ylabel("samples")  # y axis is shared, label it once

  plt.show()


# title feature
plot_uppercase_hist('title', figsize=(16, 6))

# text feature
plot_uppercase_hist('text', figsize=(20, 5))

In [0]:
real = df[df.fake == 0]
fake = df[df.fake == 1]

count_real_news = len(real)
count_fake_news = len(fake)

print(f"total real news articles: {count_real_news}")
print(f"total fake news articles: {count_fake_news}")

# Entry n of each list holds the articles whose all-uppercase word count exceeds n.
real_uppercase_title = []
real_uppercase_text = []
fake_uppercase_title = []
fake_uppercase_text = []

nmax = 7
for n in range(0, nmax):
  # real articles that have MORE than n all-uppercase words (strict >, so n=0 means "at least one")
  real_uppercase_title.append( real[real.words_uppercase_title > n] )
  real_uppercase_text.append( real[real.words_uppercase_text > n] ) 

  # fake articles that have MORE than n all-uppercase words (strict >)
  fake_uppercase_title.append( fake[fake.words_uppercase_title > n] )
  fake_uppercase_text.append( fake[fake.words_uppercase_text > n] )

In [0]:
# Visualize results
import matplotlib.pyplot as plt
import numpy as np
from matplotlib.ticker import MaxNLocator

x = np.arange(0,nmax,1)

a = [len(i) for i in real_uppercase_title]
b = [len(i) for i in fake_uppercase_title]
c = [len(i) for i in real_uppercase_text]
d = [len(i) for i in fake_uppercase_text]

fig = plt.figure(figsize=(14,5))
fig.suptitle('title: uppercase words')

ax = fig.add_subplot(111)  # 121
ax.plot(x, a, 'bs')
ax.plot(x, b, 'rs')
plt.ylabel("total samples")
plt.xlabel("total uppercase words > x")
ax.xaxis.set_major_locator(MaxNLocator(integer=True))

ax.legend(('real news', 'fake news'))

plt.show()

Normalize data function.

In [0]:
def normalize_data(df, realcols):
  """Standardize the given columns of df in place to zero mean and unit variance.

  df: DataFrame to modify (mutated in place; nothing is returned).
  realcols: iterable of numeric column names to standardize.

  Uses the sample standard deviation (pandas default, ddof=1). A column whose
  standard deviation is zero or undefined (constant column, or fewer than two
  rows) would otherwise become NaN, so it is set to 0.0 instead.
  """
  for col in realcols:
    centered = df[col] - df[col].mean()
    spread = df[col].std()
    if pd.isna(spread) or spread == 0:
      # no variation to scale by: every value sits at the mean
      df[col] = centered * 0.0
    else:
      df[col] = centered / spread

Normalize data.

In [0]:
# Every engineered feature gets standardized; the 'fake' label column must stay as it is.
dimensions_to_normalize = list(new_dimensions)
if 'fake' in dimensions_to_normalize:
  dimensions_to_normalize.remove('fake')

# Normalize a copy so the raw feature frame `df` is left untouched.
dfn = df.copy()
normalize_data(dfn, dimensions_to_normalize)

Display normalized data.

In [0]:
dfn.sample(5)

Get X and y.

In [0]:
# Split the normalized frame into the label vector and the feature matrix.
y = dfn['fake']
df_temp = dfn.drop(columns='fake')
X = df_temp

Visualize features on subset of samples

In [0]:
import seaborn as sns

# Seed NumPy's global RNG before sampling.
# NOTE(review): presumably df.sample() falls back to that global RNG when no
# random_state is passed, which is what makes the subset repeatable — confirm.
np.random.seed(4347)
# Pair-plot a random 0.25% of the (un-normalized) rows.
small_df = df.sample(int(0.0025*len(df)))

print('label size', f'true {(small_df.fake==0).sum()}, fake {(small_df.fake==1).sum()}')
sns.set(style="ticks", color_codes=True)
# corner=True draws only the lower triangle of the pair grid; markers are given per hue level.
sns.pairplot(small_df, hue = 'fake', kind = 'scatter', markers=["o", "s"], corner=True)

Train Test Split

Initial comparison of several classifiers using 25-fold cross-validation

In [0]:
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import  DecisionTreeClassifier, plot_tree
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import KFold, cross_val_score
from sklearn import linear_model

# prepare configuration for cross validation test harness
# NOTE(review): `seed` is defined but not passed to anything in this cell.
seed = 7

# candidate classifiers; each entry is (short name used on the plot, estimator)
models = [
  ('LR', LogisticRegression()),
  ('LDA', LinearDiscriminantAnalysis()),
  ('QDA', QuadraticDiscriminantAnalysis()),
  ('KNN', KNeighborsClassifier(n_neighbors=41)),
  ('Tree', DecisionTreeClassifier()),
  ('GNB', GaussianNB()),
]

# evaluate each model in turn on the same unshuffled 25-fold split
scoring = 'accuracy'
kfold = model_selection.KFold(n_splits=25)
results = []
names = []
for name, model in models:
  cv_results = model_selection.cross_val_score(model, X, y, cv=kfold, scoring=scoring)
  results.append(cv_results)
  names.append(name)
  msg = "%s: mean: %f, std: %f" % (name, cv_results.mean(), cv_results.std())
  print(msg)

# boxplot algorithm comparison: one box of per-fold accuracies per model
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
ax.boxplot(results)
ax.set_xticklabels(names)
plt.show()

KNN Classifier

In [0]:
from sklearn.model_selection import train_test_split

# NOTE(review): X and y are rebuilt here from `df` (the un-normalized features),
# replacing the X/y that were taken from the normalized `dfn` earlier. Every
# later cell that uses X/y therefore sees raw feature scales. KNN is distance
# based, so confirm that skipping normalization is intended.
y = df['fake']
df_temp = df.copy()
df_temp.drop('fake', axis=1, inplace=True)
X = df_temp

def returnScore(k, xtrain, xtest, ytrain, ytest):
  """Fit a k-nearest-neighbours classifier on the training split and return its score on the test split."""
  classifier = KNeighborsClassifier(n_neighbors=k)
  classifier.fit(xtrain, ytrain)
  test_score = classifier.score(xtest, ytest)
  return test_score

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.05, stratify=y, random_state=42)

# Score every k from 1 to 50 on the single held-out split.
test_k = range(1, 51, 1)
result = [returnScore(k, X_train, X_test, y_train, y_test) for k in test_k]

plt.plot(test_k, result)
plt.title("KNN Score with different K values. test_size=0.05")
plt.ylabel("Score")
plt.xlabel("K")
plt.show()

# test_k starts at 1 with step 1, so list position + 1 is the k value
# (the first maximum wins on ties).
highest = max(result)
best_k = result.index(highest) + 1
print(f'highest score: {highest} with {best_k} k')

Testing KNN with different Test Sizes

In [0]:
#test size 0.01 - 0.1

test_sizes = [i / 100 for i in range(1, 11)]
print(test_sizes)

best_k_list = []           # best k found for each entry of test_sizes, same order
test_k = range(1, 101, 2)  # odd k values only
for test_size in test_sizes:
  X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, stratify=y, random_state=42)

  result = [returnScore(k, X_train, X_test, y_train, y_test) for k in test_k]
  plt.plot(test_k, result)
  plt.title(f'KNN Score with different K values. test_size={test_size}')
  plt.ylabel("Score")
  plt.xlabel("K")
  plt.show()
  # map the position of the first maximum back to its k value
  best_k = test_k[result.index(max(result))]
  print(f'highest score: {max(result)} with {best_k} k')
  best_k_list.append(best_k)

plt.scatter(test_sizes, best_k_list)
plt.title('Best K with each test_size')
plt.ylabel("Best K")
plt.xlabel("Test Size")
plt.show()
# list.sort() sorts in place and returns None, so printing its result showed
# "None". sorted() returns the ordered copy and leaves best_k_list aligned
# with test_sizes.
print(sorted(best_k_list))

Cross Validation to find optimal k

In [0]:
cv_values = [5, 10]           # fold counts to average over
k_range = list(range(1, 101))
total_scores = []             # total_scores[j] belongs to k_range[j]

for k in k_range:
  knc = KNeighborsClassifier(n_neighbors=k)
  # mean accuracy for each fold count (distinct loop names: the original reused
  # `i` for both the neighbour count and the fold count)
  scores_mean = [cross_val_score(knc, X, y, cv=n_folds).mean() for n_folds in cv_values]
  total_scores.append(sum(scores_mean) / len(scores_mean))

plt.figure()
# the label gives plt.legend() something to show; without one it draws nothing
plt.plot(k_range, total_scores, color="blue", linewidth=2,
         label=f"mean accuracy, averaged over cv={cv_values}")
plt.xlabel("k")
plt.ylabel("score")
plt.title("k vs scores")
plt.legend()
plt.show()

Display best score

In [0]:
print(f'Best Score: {max(total_scores)}')

Testing KNN with CV

In [0]:
cv_range = range(2, 25, 1)  # fold counts to try for every K

for k in range(1, 10):
  print(f'\nK = {k}\n')
  knc = KNeighborsClassifier(n_neighbors=k)
  scores_mean = []
  scores_max = []
  scores_min = []

  # distinct loop names: the original reused `i` for both K and the fold count
  for n_folds in cv_range:
    scores = cross_val_score(knc, X, y, cv=n_folds)
    scores_mean.append(scores.mean())
    scores_max.append(max(scores))
    scores_min.append(min(scores))
    print(f'cv = {n_folds}, score mean = {scores.mean()}, max = {max(scores)}, min = {min(scores)}')

  plt.figure()
  # labelled lines so the legend is populated
  plt.plot(cv_range, scores_mean, color="purple", linewidth=2, label="mean")
  plt.plot(cv_range, scores_min, color="red", linewidth=2, label="min")
  plt.plot(cv_range, scores_max, color="blue", linewidth=2, label="max")
  # the x axis is the number of CV folds, not the neighbour count
  plt.xlabel("cv folds")
  plt.ylabel("score")
  plt.title(f"K = {k}: cv folds vs scores")
  plt.legend()
  plt.show()

  best_cv = cv_range[scores_mean.index(max(scores_mean))]
  print(f'highest score: {max(scores_mean)} with cv = {best_cv} (K = {k})')

KNN Confusion Matrix

In [0]:
from sklearn.metrics import confusion_matrix,plot_confusion_matrix

# Display names indexed by the numeric label: 0 (real news) -> 'True', 1 (fake news) -> 'False'.
labels = ['True', 'False']

def convert(i):
  """Map a numeric class label (0 or 1) to its display name in `labels`."""
  return labels[i]
# NOTE(review): best_k, X_train/X_test and y_train/y_test are whatever the most
# recently executed cell left behind — confirm this is the intended split and k.
knc = KNeighborsClassifier(n_neighbors=best_k)
knc.fit(X_train, y_train)
ytest_pred = knc.predict(X_test)

# Convert numeric labels to display names so the matrix can be built by name.
ytest_pred = list(map(convert, ytest_pred))
ytest_true = list(map(convert, y_test))
# NOTE(review): these two prints dump every test label to the output.
print(ytest_pred)
print(ytest_true)
cmatrix = confusion_matrix(ytest_true, ytest_pred, labels=labels) #use labels name as well

print(cmatrix)

# first argument of confusion_matrix is the truth, so rows are true classes and columns predicted classes
sns.heatmap(data = cmatrix, xticklabels = labels, yticklabels = labels, annot = True, cbar = True)

Cross Validation Score Function for use by SVM

In [0]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold, cross_val_score
from sklearn import model_selection

def models_cross_val(models, k_list, X, y):
  """Cross-validate every model at every fold count and collect the accuracy arrays.

  models: iterable of (name, estimator) pairs.
  k_list: iterable of fold counts, each passed to KFold(n_splits=...).
  X, y: features and labels handed to cross_val_score.

  Returns (results, names): parallel lists in model-major order, where
  names[i] is "<name> k=<folds>" and results[i] is the per-fold accuracy array.
  Progress is printed as a side effect.
  """
  results, names = [], []
  for model_name, estimator in models:
    print()
    print(model_name)
    for k in k_list:
      names.append(model_name + ' k=' + str(k))
      splitter = model_selection.KFold(n_splits=k)
      scores = model_selection.cross_val_score(estimator, X, y, cv=splitter, scoring='accuracy')
      results.append(scores)
      print(f'k = {k}: mean = {scores.mean()}, std = {scores.std()}, max = {max(scores)}, min = {min(scores)}')
  return results, names

SVM kernels

In [0]:
print('Cross Validation Scores for SVM kernels')

# One SVC per kernel; all other hyper-parameters are left at their defaults.
models = []
models.append(('linear' , SVC(kernel='linear')))
models.append(('poly' , SVC(kernel='poly')))
models.append(('rbf' , SVC(kernel='rbf')))
models.append(('sigmoid' , SVC(kernel='sigmoid')))

# k_list = [5,10]
# Fold counts to evaluate. models_cross_val returns one result per (model, k)
# pair, i.e. len(models) * len(k_list) entries.
k_list = [10]

results_svm, names_svm = models_cross_val(models, k_list, X, y)

Plot.

In [0]:
import matplotlib.pyplot as plt
import numpy as np

# models_cross_val returns one entry per (kernel, fold count) pair in
# kernel-major order, named "<kernel> k=<folds>". Recover that layout from
# names_svm instead of hard-coding indices 0..7: with a single fold count there
# are only four results, so fixed indices raise IndexError and group the wrong
# entries together.
kernel_names = []
for entry_name in names_svm:
    kernel = entry_name.rsplit(' k=', 1)[0]
    if kernel not in kernel_names:
        kernel_names.append(kernel)

folds_per_kernel = len(results_svm) // len(kernel_names)
# fold counts, used as x tick labels inside every panel
labels = [int(entry_name.rsplit(' k=', 1)[1]) for entry_name in names_svm[:folds_per_kernel]]

fig, axes = plt.subplots(nrows=1, ncols=len(kernel_names), figsize=(20, 5),
                         sharey=True, squeeze=False)
axes = axes[0]  # squeeze=False always yields a 2-D array, even for a single kernel

# one panel per kernel, one box per fold count
bplots = []
for position, (ax, kernel) in enumerate(zip(axes, kernel_names)):
    first = position * folds_per_kernel
    bplots.append(ax.boxplot(results_svm[first:first + folds_per_kernel],
                             vert=True,          # vertical box alignment
                             patch_artist=True,  # fill with color
                             labels=labels))     # will be used to label x-ticks
    ax.set_title(kernel)

# fill with colors (every panel; the original kept only the first and last
# boxplot handles, so the middle panels stayed uncoloured)
colors = ['lightblue', 'lightgreen']
for bplot in bplots:
    for patch, color in zip(bplot['boxes'], colors):
        patch.set_facecolor(color)

# horizontal grid lines
for ax in axes:
    ax.yaxis.grid(True)
    ax.set_xlabel('k-fold')
    ax.set_ylabel('Score')

plt.show()

SVC parameter selection function


In [0]:
from sklearn import svm
from sklearn.model_selection import GridSearchCV

def svc_param_selection(X, y, nfolds, Cs=(0.1, 1, 10), gammas=(0.1, 1, 10)):
    """Grid-search C and gamma for an RBF-kernel SVC using cross-validation.

    X, y: features and labels used to fit every candidate.
    nfolds: value passed to GridSearchCV as `cv`.
    Cs, gammas: candidate values for the two hyper-parameters; the defaults
        are the grid that was previously hard-coded.

    Returns (best_params, best_score, grid_search): the winning parameter
    dict, its mean cross-validated score, and the fitted GridSearchCV object.
    """
    param_grid = {'C': list(Cs), 'gamma': list(gammas)}
    grid_search = GridSearchCV(svm.SVC(kernel='rbf'), param_grid, cv=nfolds)
    grid_search.fit(X, y)
    return grid_search.best_params_, grid_search.best_score_, grid_search



Display the best SVC Tuning parameters

In [0]:
params, score, grid_search = svc_param_selection(X, y, 5)
print(params)

In [0]:
# best mean cross-validated score reported by the grid search
print(f'{score}')

# NOTE(review): C and gamma are hard-coded here rather than taken from the
# `params` dict returned by svc_param_selection — confirm they match.
models = []
models.append(('rbf' , SVC(kernel='rbf', C=1, gamma=1)))
k_list = [5,10,25]

# three results come back, one per fold count in k_list
results_svm, names_svm = models_cross_val(models, k_list, X, y)

In [0]:
import matplotlib.pyplot as plt
import numpy as np

labels = [5, 10, 25]

fig, axes = plt.subplots(nrows=1, ncols=1, figsize=(10, 5) )

# rbf: one box of per-fold scores for each of the three fold counts
boxes_data = [results_svm[idx] for idx in (0, 1, 2)]
bplot1 = axes.boxplot(boxes_data,
                      vert=True,          # vertical box alignment
                      patch_artist=True,  # fill with color
                      labels=labels)      # will be used to label x-ticks
axes.set_title('rbf kernel cv scores (c=1, gamma=1)')

# fill with colors, one per box
colors = ['lightblue', 'lightgreen', 'lightpink']
for box, fill in zip(bplot1['boxes'], colors):
    box.set_facecolor(fill)

# horizontal grid lines and axis labels
axes.yaxis.grid(True)
axes.set_xlabel('k-fold')
axes.set_ylabel('Score')

plt.show()

In [0]:
# Train Test Split

In [0]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

SVM kernel - determine Overfitting

In [0]:
from sklearn.svm import SVC

# Fit on the training split only, then print accuracy on both splits so the
# training and testing scores can be compared side by side.
svc = SVC(kernel='rbf', random_state=42)
svc.fit(X_train, y_train)

print(f'training score: {svc.score(X_train, y_train)}')
print(f'testing score: {svc.score(X_test, y_test)}')

ROC Curve for SVM Train/Test split

In [0]:
# ROC curve of the fitted SVC on the held-out test split.
# NOTE(review): plot_roc_curve is, as far as I know, no longer available in
# recent scikit-learn releases (RocCurveDisplay.from_estimator replaces it) —
# confirm against the installed version.
from sklearn.metrics import plot_roc_curve
plot_roc_curve(svc, X_test, y_test)

import matplotlib.pyplot as plt
plt.show()

Naive Bayes


Preprocessing


In [0]:
import pandas as pd
import numpy as np
import string
import re
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
nltk_stopwords = stopwords.words('english')
remove_punctuation = '!"$%&\'()*+,-./:;<=>?@[\\]“”^_`{|}~’'

def clean_column(dataframe, column_to_clean, new_col):
    """Return a copy of `dataframe` with a cleaned, lower-cased version of a text column.

    dataframe: source DataFrame (not modified).
    column_to_clean: name of the string column to clean.
    new_col: name of the column that receives the cleaned text.

    Each value is lower-cased and split on whitespace; a token is dropped when it
    - is a contiguous substring of string.punctuation, string.digits or
      remove_punctuation (these are substring tests, so '12' is dropped but
      '21' and 'hello,' are kept),
    - is an NLTK English stop word, or
    - is exactly one character long.
    The surviving tokens are re-joined with single spaces.

    The column values are iterated directly, so duplicate index labels are
    handled and no temporary column is added to the frame.
    """
    def _clean_text(text):
        kept = [
            token for token in text.split()
            if token not in string.punctuation
            and token not in nltk_stopwords
            and token not in string.digits
            and token not in remove_punctuation
            and len(token) != 1
        ]
        return " ".join(kept).strip()

    df_copy = dataframe.copy()
    lowered = df_copy[column_to_clean].str.lower()
    # a plain list is assigned positionally, one cleaned string per row
    df_copy[new_col] = [_clean_text(text) for text in lowered]
    return df_copy

# Ordered (old, new) substring replacements applied by filtration().
# ORDER MATTERS: each replacement sees the output of all previous ones, and
# several later entries exist only to repair damage done by earlier ones.
# '#' is deliberately not stripped (hashtags are kept).
_FILTRATION_REPLACEMENTS = [
    # --- punctuation, symbols and ad-hoc abbreviations ---
    ('"', ""),
    ("’", ""),
    (":", ""),
    ("…", ""),
    (".", ""),
    ("⋆", ""),
    (" ⋆ ", " "),  # can no longer match: "⋆" was removed by the previous entry
    ("  ", " "),
    ("$", ""),
    (",", ""),
    (" alime ", " all time "),
    (" alltime ", " all time "),
    (";", ""),  # from here on "&gt;", "&lt;" and "&amp;" have lost their ';'
    ("alime", "all time "),
    ("atm", "at the moment"),  # NOTE(review): plain substring match, also hits e.g. "treatment"
    (" ath ", " all time high "),
    ("str8", "straight"),
    (" v ", " very "),
    (" #d", ""),
    (" ddos ", " distributed denial of service "),
    ("btce", "btc"),
    ("bitcoina", "bitcoin"),
    ("rbitcoin", "bitcoin"),
    (" – ", " "),
    ("-&gt;", ""),
    (" ➤ ", " "),
    ("◄►", ""),
    ("◄", ""),
    (" ur ", " your "),
    (" u ", " you "),
    ("forthen", "for then"),
    ("&gt;", "greater than"),
    ("&lt;", "less than"),
    ("lt", ""),  # NOTE(review): plain substring match, also hits e.g. "salt", "result"
    ("gt", ""),  # NOTE(review): turns "washington" into "washinon" (partly repaired further down)
    (":", ""),
    ("&amp;", "and"),
    ("ampamp", "and"),
    (" amp ", " and "),
    ("amp", "and"),  # NOTE(review): plain substring match, also hits e.g. "campaign"
    (" bu ", " but "),
    ("/", ""),
    ("...", ""),  # can no longer match: every "." was removed earlier
    ("(", ""),
    (")", ""),
    ("“", '"'),  # re-introduces straight quotes after they were stripped at the top
    ("”", '"'),
    ("‘", ""),
    ("’", ""),
    ("-", " "),
    ("*", ""),
    ("!", ""),
    # --- emoji and other special characters ---
    ("⬛️", ""),
    ("\u200d", ""),
    ("\U0001f986", ""),
    ("\U0001f942", ""),
    ("\U0001f92f", ""),
    ("\U0001f911", ""),
    ("\U0001F193", ""),
    (" ⭕ ", " "),
    ("🤔", ""),
    ("☞ ", ""),
    ("[", ""),
    ("]", ""),
    ("{", ""),
    ("}", ""),
    # --- accented letters ---
    ("ô", "o"),
    ("ó", "o"),
    ("é", "e"),
    ("ï", "i"),
    ("®", ""),
    ("á", "a"),
    ("ã", "a"),
    ("ç", "c"),
    # --- month abbreviations (only when surrounded by spaces) ---
    (" jan ", " january "),
    (" feb ", " february "),
    (" mar ", " march "),
    (" apr ", " april "),
    (" jun ", " june "),
    (" jul ", " july "),
    (" aug ", " august "),
    (" sept ", " september "),
    (" oct ", " october "),
    (" nov ", " november "),
    (" dec ", " december "),
    # --- word-level repairs and expansions (only when surrounded by spaces) ---
    (" washinon ", " washington "),  # repairs the damage done by the "gt" entry
    (" dming ", " direct messaging "),
    (" cust ", " customer "),
    (" wcust ", " with customer "),
    (" cc ", " credit card "),
    (" gopros ", " go pros "),
    (" ultimatelyi ", " ultimately i "),
    (" 1hr ", " one hour "),
    (" rep ", " representative "),
    (" wunited ", " with united "),
    (" mp# ", " mileage plus number "),
    (" hrs ", " hours "),
    (" 4hours ", " four hours "),
    (" laxewr ", " lax ewr "),
    (" iadlax ", " iad lax "),
    (" julystill ", " july still "),
    (" 30mins ", " 30 minutes "),
    (" mins ", " minutes "),
    (" 5hours ", " 5 hours "),
    (" checkhowever ", " check however "),
    (" familyno ", " family "),
    (" 2nd ", " second "),
    (" 6hour ", " six hour "),
    (" cuz ", " because "),
    (" cause ", " because "),
    (" ideabuy ", " idea buy "),
    (" fixem ", " fix them "),
    (" properthey ", " proper they "),
    (" americanair ", " american air "),
    (" yea ", " yes "),
    (" gnteed ", " guaranteed "),
    (" 6mo ", " 6 months "),
    (" believei ", " believe "),
    (" btw ", " by the way "),
    (" intl ", " international "),
    (" thxs ", " thanks "),
    (" plususual ", " plus usual "),
    (" fridaycant ", " friday can not "),
    (" lhr ", " 1 hour "),
    (" wheelsup ", " wheels up "),
    (" tryna ", " try and "),
    (" 2hours ", " 2 hours "),
    (" 1st ", " first "),
    (" creditcard ", " credit card "),
    (" luv ", " love "),
    (" obv ", " obviously "),
    (" patientyou ", " patient you "),
    (" youwe ", " you have "),
    (" uraniumone ", " uranium one "),
]


def filtration(dataframe, column):
    """Apply the ordered substring replacements to every string in dataframe[column].

    dataframe: DataFrame to modify; the column is overwritten in place.
    column: name of a column whose values are all strings.

    Returns None. Each string runs through the whole replacement table once,
    in order, which yields the same text as applying each replacement to the
    entire column in turn.
    """
    def _apply_replacements(text):
        for old, new in _FILTRATION_REPLACEMENTS:
            text = text.replace(old, new)
        return text

    dataframe[column] = dataframe[column].apply(_apply_replacements)

In [0]:
df1 = df_original 
df1.head()

In [0]:
df1 = clean_column(df1, 'title', 'clean_title')

In [0]:
df1 = clean_column(df1, 'text', 'clean_text')

In [0]:
filtration(df1, 'clean_title')

In [0]:
filtration(df1, 'clean_text')

In [0]:
df1.head()

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from collections import Counter
import nltk
from wordcloud import WordCloud
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, accuracy_score, recall_score, precision_score

In [0]:
X_body_text = df1['clean_text'].values
X_title_text = df1['clean_title'].values
y = df1['fake'].values

In [0]:
tfidf = TfidfVectorizer(ngram_range=(1,2), max_df= 0.85, min_df= 0.01)

In [0]:
X_body_tfidf = tfidf.fit_transform(X_body_text)
# Titles get their own vectorizer with the same settings. Re-fitting `tfidf`
# itself would replace the vocabulary learned from the body text, leaving
# `tfidf` inconsistent with the columns of X_body_tfidf.
tfidf_title = TfidfVectorizer(**tfidf.get_params())
X_title_tfidf = tfidf_title.fit_transform(X_title_text)
# NOTE(review): both vectorizers are fitted on the full corpus before the
# train/test split, so test documents influence the vocabulary and IDF weights
# — confirm this is acceptable.

In [0]:
indices = df1.index.values

In [0]:
X_body_tfidf_train, X_body_tfidf_test, \
y_body_train, y_body_test, \
indices_body_train, indices_body_test = train_test_split(X_body_tfidf, y, indices, test_size = 0.2, random_state=42)

In [0]:
df1.loc[indices_body_train].groupby('fake').agg('count')

In [0]:
df1.loc[indices_body_test].groupby('fake').agg('count')

In [0]:
from sklearn.naive_bayes import MultinomialNB
nb_body = MultinomialNB()
nb_body.fit(X_body_tfidf_train, y_body_train)
y_body_train_pred = nb_body.predict(X_body_tfidf_train)

In [0]:
print('Naive Bayes In Training data F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_body_train, y_body_train_pred, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_body_train, y_body_train_pred)*100))

In [0]:
np.where(y_body_train != y_body_train_pred)

In [0]:
y_body_pred = nb_body.predict(X_body_tfidf_test)

In [0]:
# print metrics
print('Naive Bayes Test F1 and Accuracy Scores:')
print('F1 score {:.4}%'.format(f1_score(y_body_test, y_body_pred, average='macro')*100 ))
print ('Accuracy score {:.4}%'.format(accuracy_score(y_body_test, y_body_pred)*100))

In [0]:
from sklearn.naive_bayes import MultinomialNB
nb_body = MultinomialNB()
nb_body.fit(X_body_tfidf_train, y_body_train)
y_body_train_pred = nb_body.predict(X_body_tfidf_train)

from sklearn.metrics import confusion_matrix
# Fix the class order explicitly so the tick labels cannot drift:
# label 0 = real news, label 1 = fake news.
class_ids = [0, 1]
class_names = ['real', 'fake']
mat = confusion_matrix(y_body_train, y_body_train_pred, labels=class_ids)
# mat has true labels on its rows; transposing puts the true label on the x axis.
# The tick labels used to be the array .shape tuples (sample/feature counts),
# which are not class names.
sns.heatmap(mat.T, square=True, annot=True, fmt='d', cbar=False,
            xticklabels=class_names, yticklabels=class_names)
plt.title('Naive Bayes confusion matrix (training data)')
plt.xlabel('true label')
plt.ylabel('predicted label');



In [0]:
import nltk
nltk.download('punkt')

# Join every fake article's cleaned text into one string, tokenize it, and
# print the 50 most frequent tokens.
fake = df1[df1['fake'] == 1]
spam_words = nltk.word_tokenize(" ".join(fake['clean_text'].tolist()))
spam_counter = Counter(spam_words)
print(spam_counter.most_common(50))

In [0]:
# Word cloud built from the fake-article tokens.
# (Pass collocations=False to WordCloud to turn off bigrams.)
spam_wordcloud = WordCloud(
    width=1200,
    height=1000,
    random_state=42,
).generate(" ".join(spam_words))

# Black figure background, no axes, no padding around the image.
fig = plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(spam_wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [0]:
# Ten most frequent adjacent token pairs in the fake articles.
# NOTE: this rebinds `spam_counter`, replacing the single-token counts above.
spam_bigrams = nltk.bigrams(spam_words)
spam_counter = Counter(spam_bigrams)
print(spam_counter.most_common(10))

In [0]:
# Same token count as for the fake articles, now for rows with fake == 0:
# join their cleaned text, tokenize, and print the 50 most frequent tokens.
ham = df1[df1['fake'] == 0]
ham_words = nltk.word_tokenize(" ".join(ham['clean_text'].tolist()))
ham_counter = Counter(ham_words)
print(ham_counter.most_common(50))

In [0]:
# Word cloud built from the tokens of the fake == 0 articles.
ham_wordcloud = WordCloud(
    width=1200,
    height=1000,
    random_state=42,
).generate(" ".join(ham_words))

# Black figure background, no axes, no padding around the image.
fig = plt.figure(figsize=(20, 10), facecolor='k')
plt.imshow(ham_wordcloud)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

In [0]:
# Ten most frequent adjacent token pairs in the fake == 0 articles.
# NOTE: this rebinds `ham_counter`, replacing the single-token counts above.
ham_bigrams = nltk.bigrams(ham_words)
ham_counter = Counter(ham_bigrams)
print(ham_counter.most_common(10))

RNN

In [0]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns
import tensorflow as tf
import tensorflow_datasets as tfds
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, roc_curve, auc
import random

In [0]:
# NOTE(review): the first part of this notebook reads 'data/news.csv'; confirm
# this relative path matches the working directory this section runs from.
df_original = pd.read_csv('news.csv')
len_before = len(df_original)

# Drop rows with missing values, then keep only the first article per title.
# The index is rebuilt as 0..n-1 so row labels and positions agree: later cells
# look rows up by integer (e.g. df_original['title'][0]), which raises KeyError
# on the gaps left behind by the removed rows.
df_original = (
    df_original
    .dropna()
    .drop_duplicates(subset="title", keep='first')
    .reset_index(drop=True)
)

len_after = len(df_original)
print(f'number of rows removed: {len_before - len_after}')

df_original.head()

In [0]:
# Combine the title and the text of each article into one string, with a
# marker token between them. Element-wise string concatenation replaces the
# two manual counter loops.
df_original['combined_text'] = (
    df_original['title'] + ' <TITLETOTEXT> ' + df_original['text']
)

# Show the first article by position. Label-based `[0]` raises KeyError when
# the row labelled 0 was removed by the dropna / drop_duplicates step.
first_article = df_original.iloc[0]
print(first_article['title'])
print(first_article['text'])
print(first_article['combined_text'])

In [0]:
def get_equal_random_list(start, end, ratio):
  """Pick one random integer from each of N equal-width buckets of [start, end).

  N is int((end - start) * ratio). The picks are returned in shuffled order.
  A private RNG seeded with 123 is used, so repeated calls with the same
  arguments return the same list and the global `random` state is untouched.

  Args:
    start: lower bound (inclusive) of the range to sample from.
    end: upper bound (exclusive); must be greater than `start`.
    ratio: fraction of the range to sample, between 0 and 1.

  Returns:
    A list of N integers, or an empty list when the ratio is too small to
    select even one element (previously a ZeroDivisionError).

  Example:
    get_equal_random_list(0, 100, 0.05) returns five values, one from each of
    0-19, 20-39, 40-59, 60-79, 80-99, in random order (e.g. [78, 98, 54, 5, 24]).
  """
  assert end > start, "end parameter must be more than start parameter"
  assert ratio >= 0.0 and ratio <= 1.0, "ratio must be between 0 and 1"
  rng = random.Random(123)
  len_of_params = end - start
  num_of_elem = int(len_of_params * ratio)
  if num_of_elem == 0:
    return []
  step = len_of_params / num_of_elem
  # Bucket i covers [step * i, step * (i + 1)); `start` shifts the result into
  # the requested range (a no-op for start == 0).
  ret_list = [
      start + int(int(step * i) + (step * rng.random()))
      for i in range(num_of_elem)
  ]
  rng.shuffle(ret_list)
  return ret_list

print(get_equal_random_list(0, 100, 0.05))

In [0]:
# Draw an evenly spread random subset of the articles as the data set `sample`.
# NOTE(review): the old comment and constant said 10% but the call hard-coded
# 0.05. The effective 5% is kept here; raise `sample_size` if 10% was intended.
sample_size = 0.05
sample_list = get_equal_random_list(0, len(df_original), sample_size)

# get_equal_random_list returns positions in [0, len), so rows are selected
# with .iloc. Looking the positions up as labels raised KeyError for rows that
# dropna / drop_duplicates had removed, and the bare `except` silently dropped
# them from the sample.
sample = (
    df_original[['combined_text', 'fake']]
    .iloc[sample_list]
    .reset_index(drop=True)
)
len(sample)

In [0]:
# Build the vocabulary: the set of distinct tokens over every combined text.
tokenizer = tfds.features.text.Tokenizer()
vocabulary_set = set()
for article in df_original['combined_text'].values:
  vocabulary_set.update(tokenizer.tokenize(article))

# Number of unique tokens, followed by the tokens themselves.
vocab_size = len(vocabulary_set)
print(vocab_size)
print(vocabulary_set)


In [0]:
# Hold out the first `test_size` fraction of `sample` for testing and train on
# the remaining rows.
# NOTE(review): the following cells read `train_data`, but no assignment of it
# is visible before its first use, so a fresh "Run All" fails with NameError.
# It is defined here as the complement of the test rows, which matches the
# index ranges printed by the next cell.
test_size = 0.1
num_test_rows = int(len(sample) * test_size)
test_data = sample.iloc[:num_test_rows]
train_data = sample.iloc[num_test_rows:]
print(test_data.columns)

In [0]:
# Report the row ranges of both splits.
n_rows = len(sample)
test_end = int(n_rows * test_size)
train_start = int(n_rows - (n_rows * (1 - test_size)))
print(f'test: [0 - {test_end}], train: [{train_start} - {n_rows - 1}]')

# Sanity check: print any text that appears in both the test and train splits,
# together with the position of the test row.
for count, test in enumerate(test_data['combined_text']):
  for train in train_data['combined_text']:
    if train == test:
      print(f'equals at {count}\n{train}\n{test}')

In [0]:
# Class balance of each split: `true` counts fake == 0, `fake` counts fake == 1.
test_labels = test_data['fake'].tolist()
train_labels = train_data['fake'].tolist()
print(f'test(true: {test_labels.count(0)}, fake: {test_labels.count(1)})')
print(f'train(true: {train_labels.count(0)}, fake: {train_labels.count(1)})')

In [0]:
# Wrap (text, label) pairs as tf.data datasets: the full corpus plus the
# train and test splits of the sample.
datagen = tf.data.Dataset.from_tensor_slices(
    (df_original['combined_text'], df_original['fake']))
datagen_train = tf.data.Dataset.from_tensor_slices(
    (train_data['combined_text'], train_data['fake']))
datagen_test = tf.data.Dataset.from_tensor_slices(
    (test_data['combined_text'], test_data['fake']))

In [0]:
# Encoder built from the vocabulary; it turns text into a list of integers.
encoder = tfds.features.text.TokenTextEncoder(vocabulary_set)

# Round-trip the first sampled article to show the encoder in action.
samplecombined_text = sample['combined_text'].values[0]
encodedcombined_text = encoder.encode(samplecombined_text)
decodedcombined_text = encoder.decode(encodedcombined_text)
print(f'sample combined_text : \n{samplecombined_text}')
print(f'encode combined_text : \n{encodedcombined_text}')
print(f'decode combined_text : \n{decodedcombined_text}')

In [0]:
def encode(text_tensor, label):
  """Encode one example's text with the module-level `encoder`.

  `text_tensor` must support `.numpy()`; its value is passed to
  `encoder.encode`. The label is returned unchanged.
  """
  token_ids = encoder.encode(text_tensor.numpy())
  return token_ids, label

def encode_map_fn(text, label):
  """Run `encode` through tf.py_function and restore the static shapes.

  Returns the encoded text as an int64 tensor of shape [None] and the label
  as an int64 scalar.
  """
  encoded_text, label = tf.py_function(
      encode,
      inp=[text, label],
      Tout=(tf.int64, tf.int64),
  )

  # py_function does not set the shape of the tensors it returns, and
  # tf.data works best when every component has one, so set them by hand.
  encoded_text.set_shape([None])  # 1-D, unknown length
  label.set_shape([])             # scalar

  return encoded_text, label



# Encode each split with encode_map_fn and group it into padded batches of 50.
# NOTE: `train_data` / `test_data` are rebound here from the earlier objects to
# batched tf.data datasets, so cells that read their `combined_text` / `fake`
# columns cannot be re-run after this one.
train_data = datagen_train.map(encode_map_fn).padded_batch(50)
test_data = datagen_test.map(encode_map_fn).padded_batch(50)

# The full corpus, encoded and batched the same way.
data_map = datagen.map(encode_map_fn)
df_test = data_map.padded_batch(50)

In [0]:
# Single bidirectional-LSTM classifier. The last layer has no activation, so
# the model outputs a raw logit per article.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(1)
])
model.summary()

# The loss is told the outputs are logits, so the accuracy metric must be too:
# the string 'accuracy' thresholds the raw output at 0.5, whereas the 50%
# probability boundary sits at logit 0. The metric keeps the name 'accuracy'
# so history.history['accuracy'] / ['val_accuracy'] are still populated.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

history = model.fit(train_data, epochs=15, validation_data=test_data)
test_loss, test_acc = model.evaluate(test_data)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [0]:
# Accuracy and loss get separate axes. Drawn on one set of axes, the loss
# curves inherit the accuracy y-limits, the second title overwrites the first,
# and the positional legend relabels the accuracy curves as train/validation.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

ax_acc.plot(history.history['accuracy'], label='accuracy')
ax_acc.plot(history.history['val_accuracy'], label='val_accuracy')
ax_acc.set(title='Accuracy vs Epochs', xlabel='Epoch', ylabel='Accuracy',
           ylim=(0.5, 1))
ax_acc.legend(loc='lower right')

ax_loss.plot(history.history['loss'], label='train')
ax_loss.plot(history.history['val_loss'], label='validation')
ax_loss.set(title='model train vs validation loss', xlabel='Epoch',
            ylabel='loss')
ax_loss.legend(loc='upper right')

plt.show()

# NOTE: df_test is the whole corpus (built from df_original), so these numbers
# also cover the sampled rows the model was trained on.
test_loss, test_acc = model.evaluate(df_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [0]:
labels = ['True', 'False']

# The model was compiled with from_logits=True and ends in Dense(1) with no
# activation, so predict() returns logits. A probability of 0.5 corresponds to
# a logit of 0, so the decision threshold is 0, not 0.5. ravel() flattens the
# (N, 1) output to match the 1-D array of true labels.
model_logits = model.predict(df_test).ravel()
model_pred = (model_logits > 0.0).astype("int32")

model_true = df_original.fake.values
print(model_pred)
print(model_true)
cmatrix = confusion_matrix(model_true, model_pred)

print(cmatrix)
# confusion_matrix puts true labels on rows and predictions on columns.
ax = sns.heatmap(data=cmatrix, xticklabels=labels, yticklabels=labels,
                 annot=True, cbar=True, fmt='d', cmap="YlGnBu")
ax.set(xlabel='predicted label', ylabel='true label')
plt.show()


In [0]:
# A ROC curve sweeps a threshold over continuous scores. Feeding it the hard
# 0/1 predictions gives a single operating point and an AUC that is not the
# model's ranking quality, so the raw model outputs are used instead.
# NOTE: this runs predict() over df_test again so the cell does not depend on
# extra variables from the confusion-matrix cell.
model_scores = model.predict(df_test).ravel()
fpr, tpr, threshold = roc_curve(model_true, model_scores)
area = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label=f'AUC = {area:.3f}')
plt.plot([0, 1], [0, 1], 'r--')
# The legend is drawn after the labelled curve exists; before, it had no
# labelled artists to show.
plt.legend(loc='lower right')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print(f'Area Under Curve: {area}')

In [0]:
# Two stacked bidirectional LSTMs with dropout before the output layer. The
# first LSTM returns the full sequence so the second one can consume it. The
# last layer has no activation, so the model outputs a raw logit per article.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(encoder.vocab_size, 64),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1)
])
model.summary()

# The loss is told the outputs are logits, so the accuracy metric must be too:
# the string 'accuracy' thresholds the raw output at 0.5, whereas the 50%
# probability boundary sits at logit 0. The metric keeps the name 'accuracy'
# so history.history['accuracy'] / ['val_accuracy'] are still populated.
model.compile(
    loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
    optimizer=tf.keras.optimizers.Adam(1e-4),
    metrics=[tf.keras.metrics.BinaryAccuracy(name='accuracy', threshold=0.0)],
)

history = model.fit(train_data, epochs=10, validation_data=test_data)

test_loss, test_acc = model.evaluate(test_data)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [0]:
# Accuracy and loss get separate axes. Drawn on one set of axes, the loss
# curves inherit the accuracy y-limits, the second title overwrites the first,
# and the positional legend relabels the accuracy curves as train/validation.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(14, 5))

ax_acc.plot(history.history['accuracy'], label='accuracy')
ax_acc.plot(history.history['val_accuracy'], label='val_accuracy')
ax_acc.set(title='Accuracy vs Epochs', xlabel='Epoch', ylabel='Accuracy',
           ylim=(0.5, 1))
ax_acc.legend(loc='lower right')

ax_loss.plot(history.history['loss'], label='train')
ax_loss.plot(history.history['val_loss'], label='validation')
ax_loss.set(title='model train vs validation loss', xlabel='Epoch',
            ylabel='loss')
ax_loss.legend(loc='upper right')

plt.show()

# NOTE: df_test is the whole corpus (built from df_original), so these numbers
# also cover the sampled rows the model was trained on.
test_loss, test_acc = model.evaluate(df_test)

print('Test Loss: {}'.format(test_loss))
print('Test Accuracy: {}'.format(test_acc))

In [0]:
labels = ['True', 'False']

# The model was compiled with from_logits=True and ends in Dense(1) with no
# activation, so predict() returns logits. A probability of 0.5 corresponds to
# a logit of 0, so the decision threshold is 0, not 0.5. ravel() flattens the
# (N, 1) output to match the 1-D array of true labels.
model_logits = model.predict(df_test).ravel()
model_pred = (model_logits > 0.0).astype("int32")

model_true = df_original.fake.values
print(model_pred)
print(model_true)
cmatrix = confusion_matrix(model_true, model_pred)

print(cmatrix)
# confusion_matrix puts true labels on rows and predictions on columns.
ax = sns.heatmap(data=cmatrix, xticklabels=labels, yticklabels=labels,
                 annot=True, cbar=True, fmt='d', cmap="YlGnBu")
ax.set(xlabel='predicted label', ylabel='true label')
plt.show()


In [0]:
# A ROC curve sweeps a threshold over continuous scores. Feeding it the hard
# 0/1 predictions gives a single operating point and an AUC that is not the
# model's ranking quality, so the raw model outputs are used instead.
# NOTE: this runs predict() over df_test again so the cell does not depend on
# extra variables from the confusion-matrix cell.
model_scores = model.predict(df_test).ravel()
fpr, tpr, threshold = roc_curve(model_true, model_scores)
area = auc(fpr, tpr)

plt.title('Receiver Operating Characteristic')
plt.plot(fpr, tpr, 'b', label=f'AUC = {area:.3f}')
plt.plot([0, 1], [0, 1], 'r--')
# The legend is drawn after the labelled curve exists; before, it had no
# labelled artists to show.
plt.legend(loc='lower right')
plt.xlim([0, 1])
plt.ylim([0, 1])
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
print(f'Area Under Curve: {area}')