In [28]:
pip install seaborn nltk 

Note: you may need to restart the kernel to use updated packages.


In [29]:
import os
import pandas as pd
import seaborn as sns 
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
import nltk
from nltk.corpus import stopwords  
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score,precision_score,recall_score
from sklearn import metrics



# Data Analysis

In [30]:
df = pd.read_csv("data.csv")

In [31]:
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,target
0,0,Pete Buttigieg Admits Only Recently Realizing ...,1
1,1,Mom Changes Words Of Prayer To Be More Cheerful,1
2,2,Macy’s Parade Float Covered In Tickets After P...,1
3,3,Winter Storms Threaten Americans Traveling Bac...,1
4,4,White House Begins Christmas Season With Cerem...,1


In [32]:
df['Tweets_length'] = df['tweets'].str.len()
#drop null values
df = df.dropna(subset=['tweets'])

# Feature Engineering

## Text cleaning and preparation

In [33]:
# Downloading the stop words list
nltk.download('stopwords')

# Loading the stop words in english
stopword = nltk.corpus.stopwords.words('english')

def create_features_from_df(df):
    
    def remove_punct(text):
      text  = "".join([char for char in text if char not in string.punctuation])
      return text

    def clean_text(text):
      txt = re.sub("[( ' )( ')(' )]", ' ', text)
      txt=re.sub("(@[A-Za-z0-9]+)|([^0-9A-Za-z \t]) | (\w +:\ / \ / \S +)", " ", txt)
      return txt.lower()

    def remove_stopwords(text):
      text  = " ".join([word for word in text.split(" ") if word not in stopword])
      return text

    df['new_tweets'] = df['tweets'].apply(lambda x: remove_punct(str(x)))
    df['new_tweets'] = df['new_tweets'].apply(lambda x: clean_text(str(x)))
    df['new_tweets'] = df['new_tweets'].apply(lambda x: remove_stopwords(str(x)))
    df.dropna()
    return df

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\guestnew\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [34]:
df.to_csv("Cleaned_Tweets.csv")

# Comparing Fake News Classifiers

##Transforming our collected data

In [35]:
# Load data
df = pd.read_csv("Cleaned_Tweets.csv")

In [36]:
df=df.dropna(how='any')

In [37]:
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tweets,target,Tweets_length
0,0,0,Pete Buttigieg Admits Only Recently Realizing ...,1,69.0
1,1,1,Mom Changes Words Of Prayer To Be More Cheerful,1,49.0
2,2,2,Macy’s Parade Float Covered In Tickets After P...,1,89.0
3,3,3,Winter Storms Threaten Americans Traveling Bac...,1,84.0
4,4,4,White House Begins Christmas Season With Cerem...,1,71.0


In [38]:
y = df.target
df = df.drop('target', axis=1)

In [39]:
X_train, X_test, y_train, y_test = train_test_split(df['tweets'], y, test_size=0.33, random_state=53)

In [40]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_df=0.7)
tfidf_train = tfidf_vectorizer.fit_transform(X_train)
tfidf_test = tfidf_vectorizer.transform(X_test)

##Training models

### LinearSVC

In [41]:
from sklearn.svm import LinearSVC
svc_tfidf_clf = LinearSVC()

svc_tfidf_clf.fit(tfidf_train, y_train)
pred = svc_tfidf_clf.predict(tfidf_test)
score = metrics.accuracy_score(y_test, pred)
score2 = metrics.f1_score(y_test, pred)

print("accuracy:   %0.3f" % score)
print("f1score:   %0.3f" % score2)
print("precision:   %0.3f" % metrics.precision_score(y_test,pred))
print("recall_score:   %0.3f" % metrics.recall_score(y_test,pred))

accuracy:   0.915
f1score:   0.899
precision:   0.909
recall_score:   0.890


# Saving model and functions

In [42]:
import pickle
#save tfidf to the disk
filename = 'cleaning_data.sav'
pickle.dump(create_features_from_df, open(filename, 'wb'))
# save the model to disk
filename = 'svc_model.sav'
pickle.dump(svc_tfidf_clf, open(filename, 'wb'))
#save tfidf vectorizer
filename = 'tfidf_vectorizer.sav'
pickle.dump(tfidf_vectorizer, open(filename, 'wb'))

#Introspecting models

# Tweets Classification

In [43]:
# load 
svc_model = pickle.load(open("svc_model.sav", 'rb'))
cleaning_data = pickle.load(open("cleaning_data.sav", 'rb'))
tfidf_vectorizer= pickle.load(open("tfidf_vectorizer.sav", 'rb'))


In [44]:
category_codes = {
    'Real': 0,
    'Fake': 1
}

In [45]:
# Tweets scraping
import tweepy
def get_tweets(item):
    #Twitter API credentials
    consumer_key =  "2y5779N6k5EZpOz3VmOyabJHc"
    consumer_secret = "q4DR8al72steNMS8Uf4PMmAU9sR0OTWMEwbu2DGZU35S7jN1ff"
    access_token = "1534057526483832832-cHWlsx6qpL9XlXt5zCSbG1pIom6lom"
    access_token_secret = "Kj7bDn8dlvQEloRflgpAtLC1IgOoF9dkWFluRfux5MlzD"
    
    OAUTH_KEYS = {'consumer_key':consumer_key, 'consumer_secret':consumer_secret,
    'access_token_key':access_token, 'access_token_secret':access_token_secret}
    auth = tweepy.OAuthHandler(OAUTH_KEYS['consumer_key'], OAUTH_KEYS['consumer_secret'])
    api = tweepy.API(auth, wait_on_rate_limit=True)

    # Request
    search = tweepy.Cursor(api.search_tweets, q=item).items(60)

    # Creation des listes pour chaque tweet
    sn = []
    text = []
    timestamp =[]
    for tweet in search:
        sn.append(tweet.user.screen_name)
        text.append(tweet.text)
        
    # df_features
    df_features = pd.DataFrame(
         {'tweets': text 
        })
    # df_show_info

    df_show_info = pd.DataFrame(

        {'User Screen Name': sn
        })
    
    return (df_features,df_show_info)

In [46]:
def get_category_name(category_id):
    for category, id_ in category_codes.items():    
        if id_ == category_id:
            return category

In [47]:
def predict_from_features(features):
        
    predictions_pre = svc_model.predict(features)

    predictions = []

    for cat in predictions_pre:
           predictions.append(cat)

    categories = [get_category_name(x) for x in predictions]
    
    return categories

In [48]:
def complete_df(df, categories):
    df['Prediction'] = categories
    return df

In [22]:
search=input("Enter the Key : ")
(df_features,df_show_info) = get_tweets(search)

In [23]:
df_features = cleaning_data(df_features)
df_features.head()

Unnamed: 0,tweets,new_tweets
0,RT @parkjwootwts: she’s my subshine in the rai...,rt parkjwootwts she’s subshine rain 🥺httpstcob...
1,RT @BT21_: Water fight in the rain?!?\nSounds ...,rt bt21 water fight rain\nsounds like kind sum...
2,12時間後\nジミナ家🐥まで出来てた\nリアルにやってるんだ🐰殿下\nそして、シュガヒョンの...,12時間後\nジミナ家🐥まで出来てた\nリアルにやってるんだ🐰殿下\nそして、シュガヒョンの...
3,@Rain_LePoer Mana Ixion\nミストヴィレッジ 22区 (FCハウス) ...,rainlepoer mana ixion\nミストヴィレッ 22 fcハウ 30番地\nですー！
4,Hoy #4julio vamos a optar por un Menú @RockFM_...,hoy 4julio vamos optar por un men rockfmes muy...


In [24]:
features = tfidf_vectorizer.transform(df_features['new_tweets']).toarray()

In [25]:
predictions = predict_from_features(features)

In [26]:
df = complete_df(df_features, predictions)

In [27]:
#sns.countplot(x = 'Prediction', data = df)
print(search)
df.head()

rain


Unnamed: 0,tweets,new_tweets,Prediction
0,RT @parkjwootwts: she’s my subshine in the rai...,rt parkjwootwts she’s subshine rain 🥺httpstcob...,Real
1,RT @BT21_: Water fight in the rain?!?\nSounds ...,rt bt21 water fight rain\nsounds like kind sum...,Fake
2,12時間後\nジミナ家🐥まで出来てた\nリアルにやってるんだ🐰殿下\nそして、シュガヒョンの...,12時間後\nジミナ家🐥まで出来てた\nリアルにやってるんだ🐰殿下\nそして、シュガヒョンの...,Real
3,@Rain_LePoer Mana Ixion\nミストヴィレッジ 22区 (FCハウス) ...,rainlepoer mana ixion\nミストヴィレッ 22 fcハウ 30番地\nですー！,Fake
4,Hoy #4julio vamos a optar por un Menú @RockFM_...,hoy 4julio vamos optar por un men rockfmes muy...,Real
