In [2]:
import pandas as pd
# Source CSV of labelled headlines, hosted on GitHub.
DATA_URL = "https://raw.githubusercontent.com/yildirimcaglar/yildirimcaglar.github.io/master/ds3000/fake_news_data.csv"

data = pd.read_csv(DATA_URL)
data

Unnamed: 0,Headline,Label
0,Says the Annies List political group supports ...,0
1,Health care reform legislation is likely to ma...,0
2,The Chicago Bears have had more starting quart...,1
3,When Mitt Romney was governor of Massachusetts...,0
4,McCain opposed a requirement that the governme...,1
...,...,...
4549,Says Barack Obama promised to halve the defici...,1
4550,I am the only senator who turned down the stat...,1
4551,There is no system to vet refugees from the Mi...,0
4552,I think its seven or eight of the California s...,0


In [3]:
# Class balance of the raw dataset: number of rows per Label value.
data.loc[:, "Label"].value_counts()

0    2501
1    2053
Name: Label, dtype: int64

In [4]:
def sample_df_equally_by_group(df, column, n, random_state=None):
    """Draw the same number of rows from every group of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to sample from.
    column : str
        Column whose distinct values define the groups.
    n : int
        Number of rows to draw (without replacement) from each group.
    random_state : int, optional
        Seed for the sampler. Pass an integer to get the same rows on every
        run; the default ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        The sampled rows, keeping their original index labels.

    Raises
    ------
    ValueError
        Raised by pandas when ``n`` is larger than the smallest group.
    """
    return df.groupby(column).sample(n=n, random_state=random_state)

In [5]:
# Balance the classes: draw 2050 rows for each Label value.
final_data = sample_df_equally_by_group(data, "Label", 2050)

In [6]:
# Show the class-balanced sample that all later cells work from.
final_data

Unnamed: 0,Headline,Label
3079,The Colorado caucus system for selecting Repub...,0
971,Says that under City Council Member Randi Shad...,0
2633,George Allens flat tax plan would actually shr...,0
3418,The sequester has already lost 1.6 million jobs.,0
528,"Says that if Texas, California and New York al...",0
...,...,...
3094,On oil drilling,1
4261,Sen. McCain was already turning his sights to ...,1
2580,The 2012 National Survey on Drug Use and Healt...,1
2999,I became a Republican sooner in my life than R...,1


Here are the final counts in the sampled dataset:

In [7]:
# Confirm that both classes now have the same number of rows.
final_data.loc[:, "Label"].value_counts()

0    2050
1    2050
Name: Label, dtype: int64

In [8]:
import stylecloud

# headline_checker reports Label 1 as real news and Label 0 as fake news, so
# the "true" cloud is built from Label 1 and the "false" cloud from Label 0.
true_df = final_data[final_data['Label'] == 1]

tr_tx = true_df["Headline"].values

# Join with a space so the last word of one headline does not fuse with the
# first word of the next one.
tr_f = ' '.join(tr_tx)

false_df = final_data[final_data['Label'] == 0]

# The false-headline text must come from false_df, not true_df; otherwise both
# clouds are drawn from the same headlines.
fa_tx = false_df["Headline"].values

fa_f = ' '.join(fa_tx)

# Each call writes a word-cloud image to the given file name.
true_sc = stylecloud.gen_stylecloud(text=tr_f,
                          output_name="vis_true_headlines.png")

false_sc = stylecloud.gen_stylecloud(text=fa_f,
                          output_name="vis_false_headlines.png")

FALSE Cloud

<img src="https://i.ibb.co/kgbRKvh/vis-false-headlines.png" alt="vis-false-headlines" width=300>


TRUE Cloud

<img src="https://i.ibb.co/3rX79N7/vis-true-headlines.png" alt="vis-true-headlines" width=300>


In [9]:
def split_data(df, feature_column, target_column, test_size=None, random_state=3000):
    """Split one feature column and one target column into train and test parts.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding both columns.
    feature_column : str
        Name of the column used as model input.
    target_column : str
        Name of the column used as the label.
    test_size : float or int, optional
        Forwarded to ``train_test_split``. The default ``None`` leaves the
        choice to scikit-learn, exactly as before this parameter existed.
    random_state : int, optional
        Seed for the shuffle; defaults to 3000, the value that used to be
        hard-coded.

    Returns
    -------
    list
        ``[X_train, X_test, y_train, y_test]`` as returned by
        ``train_test_split``.
    """
    from sklearn.model_selection import train_test_split

    features = df[feature_column]
    target = df[target_column]

    return train_test_split(
        features, target, test_size=test_size, random_state=random_state
    )
    

In [10]:
# Headlines are the single feature; Label is the target to predict.
X_train, X_test, y_train, y_test = split_data(final_data, "Headline", "Label")

In [11]:
# Number of headlines in the training split.
X_train.shape

(3075,)

In [12]:
# Number of headlines held out for testing.
X_test.shape

(1025,)

In [57]:
def text_vectorizer(train_set, test_set, vectorizer):
    """Turn raw text into bag-of-words feature matrices.

    The vectorizer is fitted on ``train_set`` only and then applied to both
    sets, so the test set never influences the vocabulary.

    Parameters
    ----------
    train_set : iterable of str
        Training documents; the vocabulary is learned from these.
    test_set : iterable of str
        Test documents, transformed with the vocabulary learned from
        ``train_set``.
    vectorizer : str
        ``"count"`` selects ``CountVectorizer``; any other value selects
        ``TfidfVectorizer``.

    Returns
    -------
    tuple
        ``(X_train_vectorized, X_test_vectorized)``.
    """
    from sklearn.feature_extraction.text import CountVectorizer
    from sklearn.feature_extraction.text import TfidfVectorizer

    vectorizer_class = CountVectorizer if vectorizer == "count" else TfidfVectorizer

    # Fit on the argument, not on a notebook-level variable, so the function
    # gives the right result for whatever training set the caller passes.
    vect = vectorizer_class().fit(train_set)

    X_train_vectorized = vect.transform(train_set)
    X_test_vectorized = vect.transform(test_set)

    return (X_train_vectorized, X_test_vectorized)

In [58]:
# TF-IDF features for both splits.
X_train_vectorized, X_test_vectorized = text_vectorizer(X_train, X_test, "tfidf")

In [59]:
# Read the shape from the sparse matrix directly; converting to a dense array
# first would allocate the full (rows x vocabulary) grid just to inspect it.
X_train_vectorized.shape

(3075, 6892)

In [60]:
# Read the shape from the sparse matrix directly; converting to a dense array
# first would allocate the full (rows x vocabulary) grid just to inspect it.
X_test_vectorized.shape

(1025, 6892)

In [61]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

# Candidate classifiers, keyed by the display name used in the report below.
estimators = {
    'Logistic Regression': LogisticRegression(),
    'Multinomial Naive Bayes': MultinomialNB(),
    # Seeded so that ties between equally good splits resolve the same way on
    # every run.
    'Decision Tree': DecisionTreeClassifier(random_state=3000)}

for estimator_name, estimator_object in estimators.items():

    model = estimator_object.fit(X=X_train_vectorized, y=y_train)

    # score() on a scikit-learn classifier returns mean accuracy, not
    # R-squared, so the report is labelled accordingly.
    print(estimator_name + ": \n\t" +
          f'Classification accuracy on training set: {model.score(X_train_vectorized, y_train)}' + "\n\t" +
          f'Classification accuracy on testing set: {model.score(X_test_vectorized, y_test)}' + "\n")



Logistic Regression: 
	R-squared value for training set: 0.8504065040650407
	R-squared value for testing set: 0.6224390243902439

Multinomial Naive Bayes: 
	R-squared value for training set: 0.8819512195121951
	R-squared value for testing set: 0.6117073170731707

Decision Tree: 
	R-squared value for training set: 0.9996747967479674
	R-squared value for testing set: 0.5521951219512196



In [62]:
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import r2_score



# TF-IDF over unigrams and bigrams, ignoring English stop words and any term
# that appears in fewer than five training headlines.
vect = TfidfVectorizer(stop_words="english", ngram_range=(1, 2), min_df=5)
vect.fit(X_train)

# Re-encode both splits with the vocabulary learned from the training set.
X_train_vectorized, X_test_vectorized = (
    vect.transform(split) for split in (X_train, X_test)
)

model = LogisticRegression()
model.fit(X=X_train_vectorized, y=y_train)

for split_name, split_X, split_y in (
    ("training", X_train_vectorized, y_train),
    ("testing", X_test_vectorized, y_test),
):
    print("Classification accuracy on " + split_name + " set: ", model.score(split_X, split_y))


Classification accuracy on training set:  0.7778861788617886
Classification accuracy on testing set:  0.6068292682926829


In [91]:
def headline_checker(headline):
    """Print the model's verdict for one or more headlines.

    Uses the notebook-level ``vect`` (fitted vectorizer) and ``model``
    (fitted classifier).

    Parameters
    ----------
    headline : str or iterable of str
        A single headline, or a list of headlines. A bare string is treated
        as a one-element list.

    Returns
    -------
    None
        The result is printed, one verdict per headline: the predicted class
        ("Real News" for label 1, "Fake News" otherwise) and the model's
        probability for that class.
    """
    # The vectorizer expects an iterable of documents; wrap a lone string so
    # it is treated as one document.
    if isinstance(headline, str):
        headline = [headline]

    headline_features = vect.transform(headline)

    predictions = model.predict(headline_features)

    p_list = model.predict_proba(headline_features)

    # Report every headline, not just the first: comparing the whole
    # prediction array to 1 is only well-defined when it has one element.
    # Probability columns are read as [label 0, label 1].
    for predicted_label, probabilities in zip(predictions, p_list):
        if predicted_label == 1:
            print("Model classification: Real News\n Probability: %.2f" % (probabilities[1]))
        else:
            print("Model classification: Fake News\n Probability: %.2f" % (probabilities[0]))
    
   

In [92]:
# Try the classifier on a hand-written headline, passed as a one-element list.
headline_checker(["The State adds new vaccine requirement for senate members"])

Model classification: Real News
 Probability: 0.76


In [93]:
# Second hand-written example.
# NOTE(review): "Governer" is misspelled; the token is probably outside the
# fitted vocabulary and so ignored by the model — confirm whether intended.
headline_checker(["Wisconsin Governer says he will never campaign again"])

Model classification: Fake News
 Probability: 0.79
