In [1]:
import pandas as pd
train=pd.read_csv('fake_or_real_news_training.csv')
test=pd.read_csv('fake_or_real_news_test.csv')

In [2]:
import html
import re
import codecs
import string
import subprocess 
import nltk
from nltk.stem.snowball import EnglishStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.tokenize import TreebankWordTokenizer
from nltk.stem import WordNetLemmatizer
from string import digits
from nltk.corpus import stopwords

###########################################
##### Define text cleaning function #######
###########################################
def text_cleaning(text, escape_list=[], stop=[]):
    l=[]
    """
    Text cleaning function:
        Input: 
            -text: a string variable, the text to be cleaned
            -escape_list : words not to transform by the cleaning process (only lowcase transformation is needed)  
            -stop : custom stopwords
        Output:
            -text cleaned and stemmed           
    """
    
    
    """ Get stop word list from package"""
    #STOPWORDS ARE COMMENTED
    #StopWords = list(set(stopwords.words('english')))
    StopWords=[]
    custom_stop = StopWords + stop
    
    """ Step 1: Parse html entities"""
    text = html.unescape(text)
    text=text.replace('\n',' ').replace('\t',' ').replace('â€™','')
    
    
    
    """ Step 2: Decode special caracters"""
    text = text.encode('utf8').decode('unicode_escape')
    
 
    """ Step 3: Tokenise text: spliting text elements with the TreeBankWordTokenizer method"""
    tokenizer = TreebankWordTokenizer()
    tokenz=[','.join(tokenizer.tokenize(mot)) if mot  not in escape_list else mot  for mot in text.split()  ]
    
    
    """ Step 4: Drop punctuations """
    tokenz=[re.sub(r'[^\w\s]',' ',mot) if mot  not in escape_list else mot  for mot in tokenz  ]
    tokenz = ' '.join(tokenz).split()
       
    """ Step 5.1: Remove stop words """
    tokenz=([token for token in tokenz if token not in custom_stop])
    
    
    """ Step 5.2: Delete digits from text """
    #tokenz=([token for token in tokenz if (  (token.isdigit())==False)  ])  

    """ Step 5.3: Remove digits from tokens"""
    #remove_digits = str.maketrans('', '', digits)
    #tokenz=[token.translate(remove_digits)  if token not in  escape_list else token for token in tokenz   ]
    
    """ Step 6.1: Lowcase the text"""
    tokenz=([token.lower() for token in tokenz])
    
    """ Step 6.2: Lemmatize the text 
     
'''tokenz=[WordNetLemmatizer().lemmatize(token) if token not in escape_list else token for token in tokenz ]'''"""
    """ Step 6.2: Stem the text """
    tokenz=[EnglishStemmer().stem(token) if token not in escape_list else token for token in tokenz ]

    """ Step 6.3: Drop words with one caratcter and proceed last check for stop words after Stemming"""
    tokenz=[token for token in tokenz if (token not in  custom_stop and len(token)>1) ]

    return ' '.join(tokenz)

In [3]:
train.columns

Index(['ID', 'title', 'text', 'label', 'X1', 'X2'], dtype='object')

In [4]:
escape_list=[]
stop=[]
#Cleaning the train set
train['title']=train['title'].apply(text_cleaning,args=(escape_list,stop))
train['text']=train['text'].apply(text_cleaning,args=(escape_list,stop))



In [6]:
len(train[train['label']=='REAL'])

1990

In [7]:
len(train[train['label']=='FAKE'])

1976

In [8]:
train_clean=train[(train['label']=='REAL')|(train['label']=='FAKE')]

In [9]:
train_clean['label'].value_counts()

REAL    1990
FAKE    1976
Name: label, dtype: int64

In [10]:
#train_clean=pd.read_csv('train_clean.csv')

In [11]:
X=train_clean[['title','text']]
y=pd.get_dummies(train_clean['label'],drop_first=True)

<h3>Two ways you can do this. One is the easy one. Adding the title and text into one column and then using tfidf. Or use two column tfidf based on Pipeline. We will work with both and see their performance

In [12]:
#Adding title and text to one column.
X['title_text'] = X['title']+'  '+X['text']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
vec = TfidfVectorizer()
X_tfidf = vec.fit_transform(X['title_text']).toarray()

from sklearn.model_selection import train_test_split

#THIS LINE IS WHERE WE ARE SPLITTING
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.20, random_state=13)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
from sklearn.metrics import classification_report
predicted=logmodel.predict(X_test)
print('The classification Report:- \n')
print(classification_report(y_test,predicted))
from sklearn.metrics import confusion_matrix
print('Confusion Matrix:- \n')
print(confusion_matrix(y_test,predicted))
from sklearn import metrics
#Print model report:
print("\nAccuracy : %.4g" % metrics.accuracy_score(y_test,predicted))
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test,predicted))

  y = column_or_1d(y, warn=True)


The classification Report:- 

             precision    recall  f1-score   support

          0       0.89      0.94      0.91       418
          1       0.93      0.88      0.90       376

avg / total       0.91      0.91      0.91       794

Confusion Matrix:- 

[[392  26]
 [ 47 329]]

Accuracy : 0.9081
AUC Score (Train): 0.906400


<h1>The hard way using two columns: Title and Text

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import FeatureUnion, Pipeline

In [15]:
transformer = FeatureUnion([
                ('title_tfidf', 
                  Pipeline([('extract_field',
                              FunctionTransformer(lambda x: x['title'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())])),
                ('text_tfidf', 
                  Pipeline([('extract_field', 
                              FunctionTransformer(lambda x: x['text'], 
                                                  validate=False)),
                            ('tfidf', 
                              TfidfVectorizer())]))]) 

transformer.fit(X[['title','text']])

FeatureUnion(n_jobs=1,
       transformer_list=[('title_tfidf', Pipeline(memory=None,
     steps=[('extract_field', FunctionTransformer(accept_sparse=False,
          func=<function <lambda> at 0x000001F8EFC2CB70>, inv_kw_args=None,
          inverse_func=None, kw_args=None, pass_y='deprecated',
          validate=False)), ('tfi...      token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None))]))],
       transformer_weights=None)

In [16]:
X_tfidf_two=transformer.transform(X[['title','text']]).toarray()

In [17]:
X_tfidf_two.shape

(3966, 48329)

In [18]:
X_tfidf.shape

(3966, 42560)

In [19]:
y.shape

(3966, 1)

In [20]:
#Logistic Regression using Two Columns Chcek out the betterment in result
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_two, y, test_size=0.20, random_state=13)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_train,y_train)
from sklearn.metrics import classification_report
predicted=logmodel.predict(X_test)
print('The classification Report:- \n')
print(classification_report(y_test,predicted))
from sklearn.metrics import confusion_matrix
print('Confusion Matrix:- \n')
print(confusion_matrix(y_test,predicted))
from sklearn import metrics
#Print model report:
print("\nAccuracy : %.4g" % metrics.accuracy_score(y_test,predicted))
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test,predicted))

  y = column_or_1d(y, warn=True)


The classification Report:- 

             precision    recall  f1-score   support

          0       0.90      0.94      0.92       418
          1       0.94      0.89      0.91       376

avg / total       0.92      0.92      0.92       794

Confusion Matrix:- 

[[395  23]
 [ 42 334]]

Accuracy : 0.9181
AUC Score (Train): 0.916637


In [24]:
#Support Vector Machine with 'Linear' Kernel
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X_tfidf_two, y, test_size=0.20, random_state=13)

# Fitting SVM classifier to the Training set
#SVM will take some time to train.DON'T WORRY
from sklearn.svm import SVC
classifier = SVC(kernel='linear', random_state=0)
classifier.fit(X_train, y_train)

# Predicting the Test set results
predicted = classifier.predict(X_test)
print('The classification Report:- \n')
print(classification_report(y_test,predicted))
from sklearn.metrics import confusion_matrix
print('Confusion Matrix:- \n')
print(confusion_matrix(y_test,predicted))
from sklearn import metrics
#Print model report:
print("\nAccuracy : %.4g" % metrics.accuracy_score(y_test,predicted))
print("AUC Score (Train): %f" % metrics.roc_auc_score(y_test,predicted))

  y = column_or_1d(y, warn=True)


The classification Report:- 

             precision    recall  f1-score   support

          0       0.90      0.94      0.92       418
          1       0.94      0.89      0.91       376

avg / total       0.92      0.92      0.92       794

Confusion Matrix:- 

[[395  23]
 [ 42 334]]

Accuracy : 0.9181
AUC Score (Train): 0.916637


# Before testing on test data,let us train the model on the entire data

In [30]:
#Cleaning the test set`
test['title']=test['title'].apply(text_cleaning,args=(escape_list,stop))
test['text']=test['text'].apply(text_cleaning,args=(escape_list,stop))

In [31]:
#For testing the Test File
from sklearn.linear_model import LogisticRegression
logmodel = LogisticRegression()
logmodel.fit(X_tfidf_two, y)
test_tfidf_two=transformer.transform(test[['title','text']]).toarray()
test_predicted=logmodel.predict(test_tfidf_two)

  y = column_or_1d(y, warn=True)


In [36]:
#placing the predictions in test dataset
test['prediction']=test_predicted

In [39]:
#making the 1s as REAL and 0s as FAKE
test['prediction']=test['prediction'].apply(lambda x: 'REAL' if x ==1 else 'FAKE' )

In [46]:
#Making the dataset as csv for submission
test.loc[:,['ID','prediction']].to_csv('Submission.csv',index=False)