In [2]:
import re
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize 
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Fetch the NLTK resources this notebook relies on: the stop-word corpus read by
# stopwords.words('english') and the Punkt data behind word_tokenize / nltk.sent_tokenize.
# NOTE(review): newer NLTK releases may look up 'punkt_tab' instead of 'punkt' —
# confirm against the installed NLTK version.
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ahmadkhan242/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /home/ahmadkhan242/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [3]:
# Read the three CSV splits (train / validation / labelled test) from the working directory.
df, val, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "testWithLabel.csv")
)

In [4]:
# Concatenate the training and validation splits into a single training frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it the
# validation rows keep their own labels, which repeat labels already used by
# the training rows and make any later label-based alignment ambiguous.
df = pd.concat([df, val], ignore_index=True)

In [5]:
# General Pre-processing 
def remove_links(tweet):
    '''Remove web links and literal "[link]" placeholders from a string.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every "http..." URL, every "bit.ly/..." short link and
        every "[link]" placeholder deleted. Surrounding whitespace is left
        untouched.
    '''
    tweet = re.sub(r'http\S+', '', tweet)  # http:// and https:// URLs
    # The dot is escaped so only a real "bit.ly/" prefix matches.
    tweet = re.sub(r'bit\.ly/\S+', '', tweet)  # scheme-less bit.ly links
    # str.strip('[link]') treats its argument as a *set of characters* and
    # removes any of "[", "l", "i", "n", "k", "]" from both ends, which eats
    # real letters (e.g. "king" -> "g"). Remove the literal token instead.
    tweet = tweet.replace('[link]', '')
    return tweet

def remove_users(tweet):
    '''Remove retweet markers and @user mentions from a string.

    Parameters
    ----------
    tweet : str
        Raw tweet text.

    Returns
    -------
    str
        The text with every "RT @handle" and every remaining "@handle"
        deleted. A handle must start with a letter and be at least two
        characters long; shorter "@x" fragments are left in place.
    '''
    # Raw strings keep "\s" a regex escape instead of an invalid Python string
    # escape. The hyphen sits last in the character class so it is
    # unambiguously a literal "-"; the class matches the same characters as before.
    tweet = re.sub(r'RT\s@[A-Za-z]+[A-Za-z0-9_-]+', '', tweet)  # remove retweet
    tweet = re.sub(r'@[A-Za-z]+[A-Za-z0-9_-]+', '', tweet)  # remove tweeted at
    return tweet

my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'
# Character class built once from my_punctuation. re.escape keeps every listed
# symbol literal, so the backslash is stripped too rather than acting as an
# escape for the "]" that follows it. "#" is not in the list, so it is kept.
_PUNCTUATION_RUN = re.compile('[' + re.escape(my_punctuation) + ']+')


def preprocess(sent):
    '''Normalise one tweet for bag-of-words vectorisation.

    Steps, in order: drop "RT @user" / "@user" mentions, drop links,
    lower-case, replace runs of punctuation with a space, delete digits,
    collapse whitespace runs to a single space.

    Parameters
    ----------
    sent : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned text. Leading/trailing single spaces are not trimmed.
    '''
    sent = remove_users(sent)
    sent = remove_links(sent)
    sent = sent.lower()  # lower case
    sent = _PUNCTUATION_RUN.sub(' ', sent)  # strip punctuation
    sent = re.sub(r'[0-9]+', '', sent)  # remove numbers
    # Whitespace is collapsed last so that a deleted number standing between
    # two spaces does not leave a double space behind.
    sent = re.sub(r'\s+', ' ', sent)
    return sent

### Without feature engineering

In [6]:
# Clean the tweet text of the training and test frames with preprocess().
df['tweet'] = df['tweet'].apply(preprocess)
test['tweet'] = test['tweet'].apply(preprocess)

In [8]:
# TF-IDF features: the vectorizer is fitted on the training tweets only and
# then applied unchanged to the test tweets, so both share one vocabulary.
vectorizer = TfidfVectorizer()
train_final_features = vectorizer.fit_transform(df['tweet']).toarray()
test_final_features = vectorizer.transform(test['tweet']).toarray()
# Show the shape of each dense matrix, one per line.
print(train_final_features.shape, test_final_features.shape, sep='\n')

(8560, 13158)
(2140, 13158)


In [20]:
# Wrap the dense TF-IDF arrays in DataFrames (default integer row and column labels).
train_tf_idf, test_tf_idf = (
    pd.DataFrame(matrix)
    for matrix in (train_final_features, test_final_features)
)

In [21]:
# Target column of the training frame and of the labelled test frame.
train_label, test_label = df['label'], test['label']

In [22]:
# Hold out 20% of the training rows for an internal evaluation.
# random_state pins the shuffle so the split, and every score computed from it,
# is the same on each run.
X_train, X_test, y_train, y_test = train_test_split(
    train_tf_idf, train_label, test_size=0.2, random_state=42
)

In [24]:
# NumPy array copy of the test labels; used below as the reference when
# scoring predictions made on the test features.
ytest = np.array(test_label)

In [None]:
# Training: random forest on TF-IDF features only.
# random_state on the split keeps the held-out rows fixed between runs, so the
# printed scores are reproducible (the forest itself is already seeded).
X_train, X_test, y_train, y_test = train_test_split(
    train_tf_idf, train_label, test_size=0.2, random_state=42
)
_RandomForestClassifier = RandomForestClassifier(n_estimators = 1000, min_samples_split = 15, random_state = 42)
_RandomForestClassifier.fit(X_train, y_train)

# Predictions for the internal hold-out and for the separate labelled test file.
_RandomForestClassifier_prediction = _RandomForestClassifier.predict(X_test)
val_RandomForestClassifier_prediction = _RandomForestClassifier.predict(test_tf_idf)

# Metric calls use the (y_true, y_pred) argument order of the scikit-learn API.
# NOTE(review): classification_report assigns target_names to the labels in
# sorted order — confirm that ['real', 'fake'] matches the label encoding.
print("Accuracy => ", round(accuracy_score(y_test, _RandomForestClassifier_prediction)*100, 2))
print("\nRandom Forest Classifier results: \n")
print(classification_report(y_test, _RandomForestClassifier_prediction, target_names = ['real', 'fake']))
print("Validation Accuracy => ", round(accuracy_score(ytest, val_RandomForestClassifier_prediction)*100, 2))
print("\nValidation Random Forest Classifier results: \n")
print(classification_report(ytest, val_RandomForestClassifier_prediction, target_names = ['real', 'fake']))

### With feature engineering

In [None]:
# count number of characters 
def count_chars(text):
    '''Return the length of ``text``, counting every character including spaces.'''
    n_chars = len(text)
    return n_chars

# count number of words 
def count_words(text):
    '''Return how many whitespace-separated tokens ``text`` contains.'''
    tokens = text.split()
    return len(tokens)

# count number of capital characters
def count_capital_chars(text):
    '''Return the number of characters in ``text`` for which ``str.isupper`` is true.'''
    return sum(1 for char in text if char.isupper())

# count number of capital words
def count_capital_words(text):
    '''Return how many whitespace-separated tokens of ``text`` are fully upper-case.'''
    shouted = [token for token in text.split() if token.isupper()]
    return len(shouted)

# count number of punctuations
def count_punctuations(text):
    '''Count each punctuation symbol in ``text``.

    Returns a dict with one "<symbol> count" key per symbol in the list below
    (in that order), mapped to the number of occurrences of the symbol.
    '''
    punctuations='!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
    return {symbol + ' count': text.count(symbol) for symbol in punctuations}

# count number of words in quotes
def count_words_in_quotes(text):
    '''Return the number of words that appear inside quoted spans of ``text``.

    A quoted span is any non-empty run between a pair of double quotes, or
    between a pair of single quotes that are not attached to a word character
    on the outside (so apostrophes in "don't" / "it's" do not open a span).

    Parameters
    ----------
    text : str

    Returns
    -------
    int
        Total count of whitespace-separated tokens over all quoted spans;
        0 when there is none.
    '''
    # The previous pattern used a bare "." between the quotes and therefore
    # only ever matched one-character quotations.
    quoted_spans = re.findall(
        r'"([^"]+)"'                  # "double quoted"
        r"|(?<!\w)'([^']+)'(?!\w)",   # 'single quoted'
        text,
    )
    # Each match is a (double, single) tuple with exactly one non-empty member.
    return sum(len((double or single).split()) for double, single in quoted_spans)
    
# count number of sentences
def count_sent(text):
    '''Return the number of sentences ``nltk.sent_tokenize`` finds in ``text``.'''
    sentences = nltk.sent_tokenize(text)
    return len(sentences)

# calculate average word length
def avg_word_len(char_cnt,word_cnt):
    '''Return ``char_cnt / word_cnt``, the average number of characters per word.

    Returns 0.0 instead of raising when ``word_cnt`` is a zero scalar (empty
    text has no meaningful average).
    '''
    try:
        return char_cnt/word_cnt
    except ZeroDivisionError:
        return 0.0

# calculate average sentence length
def avg_sent_len(word_cnt,sent_cnt):
    '''Return ``word_cnt / sent_cnt``, the average number of words per sentence.

    Returns 0.0 instead of raising when ``sent_cnt`` is a zero scalar (text
    with no detected sentence has no meaningful average).
    '''
    try:
        return word_cnt/sent_cnt
    except ZeroDivisionError:
        return 0.0

# count number of unique words 
def count_unique_words(text):
    '''Return the number of distinct whitespace-separated tokens in ``text`` (case-sensitive).'''
    distinct_tokens = {token for token in text.split()}
    return len(distinct_tokens)
            
# words vs unique feature
def words_vs_unique(words,unique):
    '''Return ``unique / words``, the share of words that are distinct.

    Returns 0.0 instead of raising when ``words`` is a zero scalar.
    '''
    try:
        return unique/words
    except ZeroDivisionError:
        return 0.0
    
# count of hashtags
def count_htags(text):
    '''Return the number of "#tag" matches in ``text``.

    A match is "#" followed by one word character and then any run of ASCII
    letters or digits.
    '''
    return sum(1 for _ in re.finditer(r'(\#\w[A-Za-z0-9]*)', text))

# count of mentions
def count_mentions(text):
    '''Return the number of "@name" matches in ``text``.

    A match is "@" followed by one word character and then any run of ASCII
    letters or digits.
    '''
    return sum(1 for _ in re.finditer(r'(\@\w[A-Za-z0-9]*)', text))

# count of stopwords
def count_stopwords(text):
    '''Return how many tokens of ``text`` are English stop words.

    Tokens come from ``word_tokenize`` and are lower-cased before the lookup,
    because the NLTK stop-word list is lower-case and this function is applied
    to text that has not been lower-cased yet (so "The" counts like "the").
    '''
    stop_words = _english_stop_words()
    return sum(1 for token in word_tokenize(text) if token.lower() in stop_words)


def _english_stop_words():
    '''Return the English stop words as a frozenset, loading the corpus only once.'''
    cached = getattr(_english_stop_words, '_cache', None)
    if cached is None:
        # Loaded on first use and memoised on the function object, so the
        # corpus is not re-read for every tweet.
        cached = frozenset(stopwords.words('english'))
        _english_stop_words._cache = cached
    return cached

# stopwords vs words
def stopwords_vs_words(stopwords_cnt,text):
    '''Return ``stopwords_cnt`` divided by the number of ``word_tokenize`` tokens in ``text``.

    Returns 0.0 when ``text`` yields no tokens, instead of raising
    ZeroDivisionError.
    '''
    n_tokens = len(word_tokenize(text))
    if n_tokens == 0:
        return 0.0
    return stopwords_cnt/n_tokens

In [None]:
# Reload the raw CSV splits so the feature-engineering section starts from unprocessed tweets.
train, val, test = (
    pd.read_csv(csv_name)
    for csv_name in ("train.csv", "validation.csv", "testWithLabel.csv")
)

In [None]:
# Stack training and validation rows into one frame with a fresh 0..n-1 index.
# Without ignore_index=True the validation rows reuse index labels of training
# rows, and the index-based merges further down (punctuation counts, TF-IDF
# matrix) would then pair rows with features that belong to other rows.
df = pd.concat([train, val], ignore_index=True)

In [None]:
def _safe_ratio(numerator, denominator):
    '''Element-wise ``numerator / denominator`` for two Series, 0.0 where the denominator is 0.'''
    # Zeros become NaN before dividing, so no inf/NaN reaches the model input.
    return numerator.div(denominator.where(denominator != 0)).fillna(0.0)


def add_text_features(frame):
    '''Add the hand-crafted statistics of ``frame["tweet"]`` as new columns.

    The columns are written into ``frame`` itself, in a fixed order, and the
    same frame is returned. ``punct_count`` holds one dict per row, as
    returned by count_punctuations.
    '''
    tweets = frame["tweet"]
    frame['char_count'] = tweets.apply(count_chars)
    frame['word_count'] = tweets.apply(count_words)
    frame['sent_count'] = tweets.apply(count_sent)
    frame['capital_char_count'] = tweets.apply(count_capital_chars)
    frame['capital_word_count'] = tweets.apply(count_capital_words)
    frame['quoted_word_count'] = tweets.apply(count_words_in_quotes)
    frame['stopword_count'] = tweets.apply(count_stopwords)
    frame['unique_word_count'] = tweets.apply(count_unique_words)
    frame['htag_count'] = tweets.apply(count_htags)
    frame['mention_count'] = tweets.apply(count_mentions)
    frame['punct_count'] = tweets.apply(count_punctuations)
    # Ratio features derived from the counts above.
    frame['avg_wordlength'] = _safe_ratio(frame['char_count'], frame['word_count'])
    frame['avg_sentlength'] = _safe_ratio(frame['word_count'], frame['sent_count'])
    frame['unique_vs_words'] = _safe_ratio(frame['unique_word_count'], frame['word_count'])
    frame['stopwords_vs_words'] = _safe_ratio(frame['stopword_count'], frame['word_count'])
    return frame


# One shared function guarantees the training and test frames receive exactly
# the same feature columns.
df = add_text_features(df)
test = add_text_features(test)

In [None]:
# Expand the per-row {"<symbol> count": n} dicts stored in punct_count into one
# column per symbol. The expanded frames are built in row order with a default
# 0..n-1 index, so the main frames are re-indexed the same way first: df was
# concatenated from two files and may carry repeated index labels, in which
# case an index merge would attach counts belonging to other rows.
df = df.reset_index(drop=True)
test = test.reset_index(drop=True)
df_punct= pd.DataFrame(list(df.punct_count))
test_punct= pd.DataFrame(list(test.punct_count))

# Merging the punctuation DataFrame with the main DataFrame (one-to-one on the row index)
df=pd.merge(df,df_punct,left_index=True, right_index=True)
test=pd.merge(test,test_punct,left_index=True, right_index=True)

# The dict column is redundant once expanded, so drop it from both frames.
df = df.drop(columns=['punct_count'])
test = test.drop(columns=['punct_count'])

In [None]:
# With the raw-text statistics extracted, clean the tweet text itself for TF-IDF.
df['tweet'] = df['tweet'].apply(preprocess)
test['tweet'] = test['tweet'].apply(preprocess)

In [None]:
# punct_count is already removed by the cell that expands it into per-symbol
# columns; errors='ignore' makes this clean-up a no-op in that case instead of
# raising KeyError and stopping a top-to-bottom run.
df = df.drop(columns=['punct_count'], errors='ignore')
test = test.drop(columns=['punct_count'], errors='ignore')

In [None]:
# Force the pandas string dtype and replace missing tweets with "" so the
# vectorizer always receives text.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['tweet'] operates on an intermediate object and, under pandas
# Copy-on-Write, leaves the frame itself unchanged.
df['tweet'] = df['tweet'].astype('string').fillna('')
test['tweet'] = test['tweet'].astype('string').fillna('')

In [None]:
# TF-IDF fitted on the training tweets only, then applied to the test tweets.
vectorizer            =  TfidfVectorizer()
train_tf_idf_features =  vectorizer.fit_transform(df['tweet']).toarray()
test_tf_idf_features  =  vectorizer.transform(test['tweet']).toarray()
# Converting the dense arrays above to DataFrames (default 0..n-1 row index)
train_tf_idf          = pd.DataFrame(train_tf_idf_features)
test_tf_idf           = pd.DataFrame(test_tf_idf_features)
# Separating train and test labels from all features
# (taken here because `test` is replaced by the feature matrix below)
train_Y               = df['label']
test_Y                = test['label']
# Listing all hand-crafted features
features = ['char_count', 'word_count', 'sent_count',
       'capital_char_count', 'capital_word_count', 'quoted_word_count',
       'stopword_count', 'unique_word_count', 'htag_count', 'mention_count',
       'avg_wordlength', 'avg_sentlength', 'unique_vs_words',
       'stopwords_vs_words', '! count', '" count', '# count', '$ count',
       '% count', '& count', '\' count', '( count', ') count', '* count',
       '+ count', ', count', '- count', '. count', '/ count', ': count',
       '; count', '< count', '= count', '> count', '? count', '@ count',
       '[ count', '\\ count', '] count', '^ count', '_ count', '` count',
       '{ count', '| count', '} count', '~ count']
# Finally merging all features with the TF-IDF matrix.
# TF-IDF row i was computed from row i of df/test, so the hand-crafted
# features are re-indexed to 0..n-1 before the index merge; this keeps the
# pairing positional even if the source frame carries repeated index labels.
train = pd.merge(train_tf_idf, df[features].reset_index(drop=True), left_index=True, right_index=True)
test  = pd.merge(test_tf_idf, test[features].reset_index(drop=True), left_index=True, right_index=True)
# The TF-IDF columns are labelled with ints and the hand-crafted ones with
# strs; recent scikit-learn releases reject mixed label types, so make them all strs.
train.columns = train.columns.astype(str)
test.columns  = test.columns.astype(str)

In [None]:
# Random Forest Classifier on TF-IDF plus hand-crafted features.
# 80/20 split of the combined training matrix, seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    train, train_Y, test_size=0.2, random_state=42
)
_RandomForestClassifier = RandomForestClassifier(
    n_estimators=1000, min_samples_split=15, random_state=42
)
_RandomForestClassifier.fit(X_train, y_train)

# Predictions for the internal hold-out and for the separate labelled test file.
_RandomForestClassifier_prediction = _RandomForestClassifier.predict(X_test)
val_RandomForestClassifier_prediction = _RandomForestClassifier.predict(test)

# NOTE(review): classification_report assigns target_names to the labels in
# sorted order — confirm that ['real', 'fake'] matches the label encoding.
print("Accuracy => ", round(accuracy_score(y_test, _RandomForestClassifier_prediction) * 100, 2))
print("\nRandom Forest Classifier results: \n")
print(classification_report(y_test, _RandomForestClassifier_prediction, target_names=['real', 'fake']))
print("Validation Accuracy => ", round(accuracy_score(test_Y, val_RandomForestClassifier_prediction) * 100, 2))
print("\nValidation Random Forest Classifier results: \n")
print(classification_report(test_Y, val_RandomForestClassifier_prediction, target_names=['real', 'fake']))