In [7]:
import pandas as pd
import re
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_recall_fscore_support
import numpy as np
from multiprocessing import  Pool
import spacy
!python -m spacy download en_core_web_md

Collecting en-core-web-md==3.2.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.2.0/en_core_web_md-3.2.0-py3-none-any.whl (45.7 MB)
     |████████████████████████████████| 45.7 MB 56.7 MB/s            
Collecting typing-extensions<4.0.0.0,>=3.7.4
  Downloading typing_extensions-3.10.0.2-py3-none-any.whl (26 kB)
Installing collected packages: typing-extensions, en-core-web-md
  Attempting uninstall: typing-extensions
    Found existing installation: typing-extensions 4.1.1
    Uninstalling typing-extensions-4.1.1:
      Successfully uninstalled typing-extensions-4.1.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-io 0.21.0 requires tensorflow-io-gcs-filesystem==0.21.0, which is not installed.
explainable-ai-sdk 1.3.2 requires xai-image-widget, which is not installed.
tensorflow 2.6.2 requires numpy~=1

In [8]:
train = pd.read_csv("/kaggle/input/d/yipeng07/fakenews/fulltrain.csv",header=None)
test = pd.read_csv("/kaggle/input/d/yipeng07/fakenews/balancedtest.csv",header=None)

## 1. Obtain the basic features

In [9]:
#Check the first 10 lines
train.head(10)

Unnamed: 0,0,1
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...
5,1,"At a cafeteria-table press conference Monday, ..."
6,1,Stunned shock and dismay were just a few of th...
7,1,"Speaking with reporters before a game Monday, ..."
8,1,Sports journalists and television crews were p...
9,1,"SALEM, VAF;or the eighth straight world-histor..."


In [10]:
#Change columns name
train.columns = ['Verdict','Text']

In [11]:
#Check the first 10 lines
test.head(10)

Unnamed: 0,0,1
0,1,When so many actors seem content to churn out ...
1,1,In what football insiders are calling an unex...
2,1,In a freak accident following Game 3 of the N....
3,1,North Koreas official news agency announced to...
4,1,The former Alaska Governor Sarah Palin would b...
5,1,With the first Presidential debate just two da...
6,1,"There are fans, and then there are super-fans...."
7,1,"With its landmark decisions this week, the Uni..."
8,1,Koch Industries is defending its acquisition o...
9,1,Republican lawmakers asked increasingly tough ...


In [12]:
#Change columns name
test.columns = ['Verdict','Text']

In [13]:
#Word count
train['word_count'] = train['Text'].apply(lambda x: len(str(x).split(" ")))
train[['Text', 'word_count']].head()

Unnamed: 0,Text,word_count
0,"A little less than a decade ago, hockey fans w...",147
1,The writers of the HBO series The Sopranos too...,123
2,Despite claims from the TV news outlet to offe...,706
3,After receiving 'subpar' service and experienc...,706
4,After watching his beloved Seattle Mariners pr...,174


In [14]:
#Char count
train['char_count'] = train['Text'].str.len()
train[['Text','char_count']].head()

Unnamed: 0,Text,char_count
0,"A little less than a decade ago, hockey fans w...",873
1,The writers of the HBO series The Sopranos too...,715
2,Despite claims from the TV news outlet to offe...,4443
3,After receiving 'subpar' service and experienc...,3913
4,After watching his beloved Seattle Mariners pr...,1058


In [15]:
#Average word length

def avg_word(sentence):
    words=sentence.split()
    return (sum(len(word) for word in words)/len(words))

train['avg_word'] = train['Text'].apply(lambda x:avg_word(x))
train[['Text','avg_word']].head()

Unnamed: 0,Text,avg_word
0,"A little less than a decade ago, hockey fans w...",4.979452
1,The writers of the HBO series The Sopranos too...,4.860656
2,Despite claims from the TV news outlet to offe...,5.302128
3,After receiving 'subpar' service and experienc...,4.550355
4,After watching his beloved Seattle Mariners pr...,5.115607


In [16]:
# The number of stop words
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
stop=stopwords.words('english')
train['stopwords']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x in stop]))
train[['Text','stopwords']].head()

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Unnamed: 0,Text,stopwords
0,"A little less than a decade ago, hockey fans w...",46
1,The writers of the HBO series The Sopranos too...,43
2,Despite claims from the TV news outlet to offe...,219
3,After receiving 'subpar' service and experienc...,299
4,After watching his beloved Seattle Mariners pr...,59


In [17]:
#The number of special chars
train['hashtags']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.startswith("#")]))
train[['Text','hashtags']].head()

Unnamed: 0,Text,hashtags
0,"A little less than a decade ago, hockey fans w...",0
1,The writers of the HBO series The Sopranos too...,0
2,Despite claims from the TV news outlet to offe...,0
3,After receiving 'subpar' service and experienc...,0
4,After watching his beloved Seattle Mariners pr...,0


In [18]:
#The number of numerics
train['numerics']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.isdigit()]))
train[['Text','numerics']].head()

Unnamed: 0,Text,numerics
0,"A little less than a decade ago, hockey fans w...",0
1,The writers of the HBO series The Sopranos too...,1
2,Despite claims from the TV news outlet to offe...,20
3,After receiving 'subpar' service and experienc...,5
4,After watching his beloved Seattle Mariners pr...,0


In [19]:
#The number of upper vocab
train['upper']=train['Text'].apply(lambda sen:len([x for x in sen.split() if x.isupper()]))
train[['Text','upper']].head()

Unnamed: 0,Text,upper
0,"A little less than a decade ago, hockey fans w...",4
1,The writers of the HBO series The Sopranos too...,2
2,Despite claims from the TV news outlet to offe...,9
3,After receiving 'subpar' service and experienc...,13
4,After watching his beloved Seattle Mariners pr...,3


In [20]:
train.head()

Unnamed: 0,Verdict,Text,word_count,char_count,avg_word,stopwords,hashtags,numerics,upper
0,1,"A little less than a decade ago, hockey fans w...",147,873,4.979452,46,0,0,4
1,1,The writers of the HBO series The Sopranos too...,123,715,4.860656,43,0,1,2
2,1,Despite claims from the TV news outlet to offe...,706,4443,5.302128,219,0,20,9
3,1,After receiving 'subpar' service and experienc...,706,3913,4.550355,299,0,5,13
4,1,After watching his beloved Seattle Mariners pr...,174,1058,5.115607,59,0,0,3


In [25]:
#Delete HTML
from bs4 import BeautifulSoup
train['Text'] = train['Text'].apply(lambda x: BeautifulSoup(x,'html.parser').get_text())
test['Text'] = test['Text'].apply(lambda x: BeautifulSoup(x,'html.parser').get_text())
test['Text'].head()

#Remove emoji
def remove_emoji(text):
    emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', text)
train['Text']=train['Text'].apply(lambda x: remove_emoji(x))
test['Text']=test['Text'].apply(lambda x: remove_emoji(x))

#Transform to lower letter
train['Text'] = train['Text'].apply(lambda x: x.lower())
test['Text'] = test['Text'].apply(lambda x: x.lower())
test['Text'].head()

#Remove punctuation
import re
train['Text'] = train['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
test['Text'] = test['Text'].apply(lambda x: re.sub(r'[^\w\s]', '', x))
test['Text'].head()

!pip install inflect

#Substitute number
import inflect
def to_digit(digit):
    i = inflect.engine()
    if digit.isdigit():
        output = i.number_to_words(digit)
    else:
        output = digit
    return output
train['Text'] = train['Text'].apply(lambda x: to_digit(x))
test['Text'] = test['Text'].apply(lambda x: to_digit(x))
test['Text'].head()

#Remove the stopwords
from nltk.corpus import stopwords
stop=stopwords.words('english')
train['Text']=train['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))
test['Text']=test['Text'].apply(lambda sen:" ".join(x for x in sen.split() if x not in stop))
test['Text'].head()

#Remove the frequency words
freq=pd.Series(' '.join(train['Text']).split()).value_counts()[:10]
freq=list(freq.index)
train['Text']=train['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))
test['Text']=test['Text'].apply(lambda sen:' '.join(x for x in sen.split() if x not in freq))
test['Text'].head()

# Remove the scarce word
freq = pd.Series(' '.join(train['Text']).split()).value_counts()[-10:]
freq = list(freq.index)
train['Text'] = train['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
test['Text'] = test['Text'].apply(lambda x: " ".join(x for x in x.split() if x not in freq))
test['Text'].head()

#Noise Removal
def text_cleaner(text):
    rules = [
        {r'>\s+': u'>'},  # remove spaces after a tag opens or closes
        {r'\s+': u' '},  # replace consecutive spaces
        {r'\s*<br\s*/?>\s*': u'\n'},  # newline after a <br>
        {r'</(div)\s*>\s*': u'\n'},  # newline after </p> and </div> and <h1/>...
        {r'</(p|h\d)\s*>\s*': u'\n\n'},  # newline after </p> and </div> and <h1/>...
        {r'<head>.*<\s*(/head|body)[^>]*>': u''},  # remove <head> to </head>
        {r'<a\s+href="([^"]+)"[^>]*>.*</a>': r'\1'},  # show links instead of texts
        {r'[ \t]*<[^<]*?/?>': u''},  # remove remaining tags
        {r'^\s+': u''}  # remove spaces at the beginning
    ]
    for rule in rules:
        for (k, v) in rule.items():
            regex = re.compile(k)
            text = regex.sub(v, text)
        text = text.rstrip()
    return text.lower()

train['Text']=train['Text'].apply(lambda x: text_cleaner(x))

#Lemmatization
from textblob import Word
import nltk
nltk.download('wordnet')
train['Text']=train['Text'].apply( lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))
test['Text']=test['Text'].apply(lambda x:" ".join([Word(word).lemmatize() for word in x.split()]))

Collecting inflect
  Downloading inflect-5.4.0-py3-none-any.whl (33 kB)
Installing collected packages: inflect
Successfully installed inflect-5.4.0
[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


#### TF-IDF

In [26]:
from sklearn.feature_extraction.text import TfidfVectorizer
# Always start with these features. They work (almost) everytime!
tfv = TfidfVectorizer(min_df=3,  max_features=None, 
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3), use_idf=1,smooth_idf=1,sublinear_tf=1,
            stop_words = 'english')

# Fitting TF-IDF to both training and test sets (semi-supervised learning)
tfv.fit(list(train['Text']))
xtrain_tfv =  tfv.transform(train['Text']) 
# xvalid_tfv = tfv.transform(xvalid)
xtest_tfv = tfv.transform(test['Text'])

#### Word2Vec

In [None]:
nlp=spacy.load('en_core_web_md',disable=['parser','ner','tagger'])

def to_vec(x):
    vecs=[]
    for s in x:
        vecs.append(nlp(s).vector)
    return vecs

def preprocess(Texts,n_cores=4):
    '''
    covert texts to vectors by word2vec
    '''
    texts_split = np.array_split(Texts, n_cores)
    pool = Pool(n_cores)
    vecs = pool.map(to_vec, texts_split)
    pool.close()
    pool.join()
    #vecs=np.array(vecs)#.reshape((-1,300))
    vecs1=[]
    for ele in vecs:
        vecs1.extend(ele)
    return np.array(vecs)


word2vec_train=preprocess(train['Text'])
word2vec_test=preprocess(test['Text'])

def trans(data): 
    vecs1=[]
    for ele in data:
        vecs1.extend(ele)
    return np.array(vecs1).reshape(-1,300)
word2vec_train=trans(word2vec_train)
word2vec_test=trans(word2vec_test)

-----------------------

### Validate 2D-feature representation

In [22]:
def convert_label(labels):
    label2coord={1:[1,0],2:[0,0],3:[0.5,0.5],4:[1,1]}
    vecs=labels.apply(lambda x:label2coord[x])
    vecs1=[]
    for ele in vecs:
        vecs1.append(ele)
    return np.array(vecs1,dtype=str)
train_coord=convert_label(train['Verdict'])
test_coord=convert_label(test['Verdict'])

In [27]:
clf0_0=LogisticRegression(C=40,random_state=0,solver='newton-cg')
clf0_0.fit(xtrain_tfv, train_coord[:,0])

clf0_1=LogisticRegression(C=40,random_state=0,solver='newton-cg')
clf0_1.fit(xtrain_tfv, train_coord[:,1])

train_pred0 = clf0_0.predict(xtrain_tfv).astype(float)
train_pred1 = clf0_1.predict(xtrain_tfv).astype(float)

test_pred0 = clf0_0.predict(xtest_tfv).astype(float)
test_pred1 = clf0_1.predict(xtest_tfv).astype(float)

train_pred=np.ones((len(train_pred0),2))
train_pred[:,0]=train_pred0
train_pred[:,1]=train_pred1

test_pred=np.ones((len(test_pred0),2))
test_pred[:,0]=test_pred0
test_pred[:,1]=test_pred1

In [51]:
clf1=LogisticRegression(C=1e3,random_state=0,solver='newton-cg')
clf1.fit(train_pred,train['Verdict'])
predictions=clf1.predict(test_pred) 


precision_recall_fscore_support(test['Verdict'], predictions, average='macro')

(0.7529343003642175, 0.752, 0.7427438080854588, None)

In [46]:
from sklearn.preprocessing import OneHotEncoder
enc=OneHotEncoder()
train_pred_cat=enc.fit_transform(train_pred)
test_pred_cat=enc.fit_transform(test_pred)

clf2=LogisticRegression(C=40,random_state=0,solver='newton-cg')
clf2.fit(train_pred_cat,train['Verdict'])
predictions=clf2.predict(test_pred_cat) #=predict_label(pred0.astype(float),pred1.astype(float))

precision_recall_fscore_support(test['Verdict'], predictions, average='macro')

(0.7613946438565036, 0.75, 0.7379114380127544, None)

-------------------------

## Random Forest

TF_DFT + RF F1=0.62 when n_tree=100 

In [None]:
from sklearn.ensemble import RandomForestClassifier
clf=RandomForestClassifier(1000,n_jobs=-1)
clf.fit(xtrain_tfv, train['Verdict'])
predictions = clf.predict(xtest_tfv)
precision_recall_fscore_support(test['Verdict'], predictions, average='macro')

In [None]:
clf=RandomForestClassifier(1000,n_jobs=-1)
clf.fit(word2vec_train, train['Verdict'])
predictions = clf.predict(word2vec_test)
precision_recall_fscore_support(test['Verdict'], predictions, average='macro')