In [1]:
import pandas 
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,accuracy_score,f1_score,precision_score,recall_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

# read csvs and make dataframes

In [2]:
true_news_df = pandas.read_csv("News/True.csv")
fake_news_df = pandas.read_csv("News/Fake.csv")

In [3]:
true_news_df["data"] = true_news_df["title"] + true_news_df["text"]
fake_news_df["data"] = fake_news_df["title"] + fake_news_df["text"]

# preprocess data
remove single character words and stop_words and lemmatize each word

In [4]:
stop_words = nltk.corpus.stopwords.words('english')
lemmatize = nltk.stem.WordNetLemmatizer().lemmatize #function lemmatize
def preprocess(data_list):
    data_tokens_list = []
    tokenizer = nltk.tokenize.RegexpTokenizer('\w+')
    for data in data_list:
        data_tokens = " ".join([lemmatize(token) for token in tokenizer.tokenize(data) if len(token)>1 and token.lower() not in stop_words])
        data_tokens_list.append(data_tokens)
        
        
    return data_tokens_list
        

In [5]:
true_news_df = pandas.DataFrame({
    "content":preprocess(true_news_df["data"]),
    "lable":1 # lable for True 
})

In [6]:
fake_news_df = pandas.DataFrame({
    "content":preprocess(fake_news_df["data"]),
    "lable":0 # lable for fake 
})

In [7]:
news_df = pandas.concat([true_news_df , fake_news_df])

In [8]:
news_df

Unnamed: 0,content,lable
0,budget fight loom Republicans flip fiscal scri...,1
1,military accept transgender recruit Monday Pen...,1
2,Senior Republican senator Let Mr Mueller job W...,1
3,FBI Russia probe helped Australian diplomat ti...,1
4,Trump want Postal Service charge much Amazon s...,1
...,...,...
23476,McPain John McCain Furious Iran Treated US Sai...,0
23477,JUSTICE Yahoo Settles mail Privacy Class actio...,0
23478,Sunnistan US Allied Safe Zone Plan Take Territ...,0
23479,Blow 700 Million Al Jazeera America Finally Ca...,0


# split test and trains

In [9]:
train_contents ,test_contents ,train_lables,test_lables = train_test_split(news_df["content"],news_df["lable"],test_size=0.2,shuffle=True)

# classification reports

In [10]:
def report_of_classifier(train_data , train_lables , test_data , test_lables , classifier):
    classifier.fit(train_data,train_lables)
    predicts = classifier.predict(test_data)
    conf_matrix = confusion_matrix(test_lables,predicts)
    print(f"accuracy = {accuracy_score(test_lables,predicts)}\n",    
          f"precision = {precision_score(test_lables,predicts)}\n",
          f"recall = {recall_score(test_lables,predicts)}\n",
          f"f1_score = {f1_score(test_lables,predicts)}\n",
          f"confusion_matrix = [{conf_matrix[0,0]}  {conf_matrix[0,1]}\n",
          f"                    {conf_matrix[1,0]}  {conf_matrix[1,1]}]"
         )
    

# sentence to vectors and classification

## with Bag_of_words

In [11]:
bag_of_words_vectorizer = CountVectorizer(ngram_range=(2,2))
vectors = bag_of_words_vectorizer.fit_transform( train_contents.to_list()+test_contents.to_list())

In [12]:
train_vectors = vectors[:len(train_contents)]
test_vectors = vectors[len(train_contents):]

### with linear svm algorithm

In [13]:
report_of_classifier(train_vectors,train_lables,test_vectors,test_lables,SVC(kernel="linear"))

accuracy = 0.9789532293986637
 precision = 0.9827748938178386
 recall = 0.9729035272132679
 f1_score = 0.9778142974527526
 confusion_matrix = [4626  73
                     116  4165]


#### with Naive Bayes algorithm

In [14]:
report_of_classifier(train_vectors,train_lables,test_vectors,test_lables,MultinomialNB())

accuracy = 0.9790645879732739
 precision = 0.9684138246738384
 recall = 0.9883204858677879
 f1_score = 0.9782658959537572
 confusion_matrix = [4561  138
                     50  4231]


## with_idf

In [15]:
tf_idf_vectorizer = TfidfVectorizer()
vectors = tf_idf_vectorizer.fit_transform( train_contents.to_list()+test_contents.to_list())
train_vectors = vectors[:len(train_contents)]
test_vectors = vectors[len(train_contents):]

In [16]:
report_of_classifier(train_vectors,train_lables,test_vectors,test_lables,SVC(kernel="linear"))

accuracy = 0.9944320712694877
 precision = 0.9923202234116826
 recall = 0.9960289651950479
 f1_score = 0.9941711354628118
 confusion_matrix = [4666  33
                     17  4264]


In [17]:
report_of_classifier(train_vectors,train_lables,test_vectors,test_lables,MultinomialNB())

accuracy = 0.9400890868596882
 precision = 0.9465521355285135
 recall = 0.926652651249708
 f1_score = 0.9364966949952784
 confusion_matrix = [4475  224
                     314  3967]
