In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import calendar
import string
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# Read both source files and tag every row with its class:
# 1 for rows coming from True.csv, 0 for rows coming from Fake.csv.
true = pd.read_csv("True.csv").assign(label=1)
fake = pd.read_csv("Fake.csv").assign(label=0)

# Stack the two frames under a fresh 0..n-1 index and discard the columns
# that are not used in the combined frame.
main = pd.concat([true, fake], ignore_index=True).drop(columns=["date", "subject"])

# "length" = number of characters in the article text, not counting spaces.
main["length"] = main["text"].apply(lambda body: len(body) - body.count(" "))

In [None]:
# Sanity-check the combined frame: overall shape, class balance, missing values.
n_rows, n_cols = main.shape
# Label 0 is assigned to rows read from Fake.csv, label 1 to rows from True.csv,
# so the counts are reported as fake/true (the message previously said spam/ham).
n_fake = int((main["label"] == 0).sum())
n_true = int((main["label"] == 1).sum())

print("Input data has {} rows and {} columns".format(n_rows, n_cols))
print("Out of {} rows, {} are fake, {} are true".format(n_rows, n_fake, n_true))
print("Number of null in label: {}".format(main["label"].isnull().sum()))
print("Number of null in text: {}".format(main["text"].isnull().sum()))

In [None]:
# Shared resources used by clean_text.
# NOTE: NLTK's "stopwords" and "wordnet" corpora must already be available
# locally (see nltk.download) or these lookups raise LookupError.
wn = nltk.WordNetLemmatizer()
stoplist = set(stopwords.words("english"))

# Translation table that deletes every ASCII punctuation character in one pass.
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)
# Raw string on purpose: "\W" inside a plain literal is an invalid escape
# sequence and triggers a warning on current Python versions.
_NON_WORD = re.compile(r"\W+")


def clean_text(text):
    """Convert a raw document into a list of lemmatized, lower-case tokens.

    Steps: strip ASCII punctuation, lower-case, split on runs of non-word
    characters, drop English stop words, lemmatize what is left.

    Parameters
    ----------
    text : str
        The raw document.

    Returns
    -------
    list of str
        Lemmatized tokens in document order. Empty strings produced by the
        split (when the text starts or ends with a non-word character) are
        discarded so that "" never becomes a vocabulary entry.
    """
    normalized = text.translate(_PUNCT_TABLE).lower()
    return [
        wn.lemmatize(token)
        for token in _NON_WORD.split(normalized)
        if token and token not in stoplist
    ]

In [None]:
# Fit TF-IDF over the article texts. clean_text performs the whole
# preprocessing/tokenisation step itself, so it is passed as the analyzer.
tfidf_vect = TfidfVectorizer(analyzer=clean_text)
X_tfidf = tfidf_vect.fit_transform(main['text'])
print(X_tfidf.shape)

# get_feature_names() was removed in scikit-learn 1.2. Use its replacement
# when present and fall back only on releases that predate it.
if hasattr(tfidf_vect, "get_feature_names_out"):
    feature_names = list(tfidf_vect.get_feature_names_out())
else:
    feature_names = tfidf_vect.get_feature_names()

# Show a short preview instead of dumping the whole vocabulary into the
# cell output; the full list stays available in `feature_names`.
print(feature_names[:25])

# Keep the matrix sparse: toarray() allocates documents x vocabulary floats,
# which can exhaust memory once the vocabulary is large.
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(X_tfidf, columns=feature_names)
X_tfidf_df

In [None]:
# Bar chart of article counts per subject for the rows loaded into `true`.
true_subject_counts = true["subject"].value_counts()
true_subject_counts.plot(kind="bar")
plt.title("True news articles for different subjects")
plt.xticks(rotation=0)
plt.show()

# Same chart for the rows loaded into `fake`; tick labels are tilted here.
fake_subject_counts = fake["subject"].value_counts()
fake_subject_counts.plot(kind="bar")
plt.title("Fake news articles for different subjects")
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

# Overlaid, density-normalised histograms of the "length" column, one per label.
bins = np.linspace(0, 10000, 40)
for label_value, legend_name in ((0, 'fake'), (1, 'true')):
    lengths = main.loc[main['label'] == label_value, 'length']
    plt.hist(lengths, bins, alpha=0.5, label=legend_name, density=True)
plt.legend()
plt.show()