In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import data

In [2]:
data = pd.read_csv('data/dataset_3/train/train_data.csv')
data.shape

(20630, 4)

In [3]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
data['label'].value_counts()

0    10343
1    10287
Name: label, dtype: int64

In [5]:
data['author'].value_counts().head(10)

Pam Key                243
admin                  193
Jerome Hudson          166
Charlie Spiering       141
John Hayward           140
Katherine Rodriguez    124
Warner Todd Huston     122
Ian Hanchett           119
Breitbart News         118
Daniel Nussbaum        112
Name: author, dtype: int64

### Text Preprocessing

In [6]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Remove empty (NaN) values with na in dataset
data = data.replace(np.nan, 'na', regex=True)

In [9]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


#### Concatenate feature inputs

In [10]:
frames = [data['title'],data['author'],data['text']]
new_data = pd.Series.to_frame(pd.concat(frames))
new_data['label'] = data['label']
new_data[:1]

Unnamed: 0,0,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1


In [11]:
data['combined_input'] = data['author'].map(str) + data['title'].map(str)+data['text'].map(str)

In [12]:
# Split data into train and test datasets
# We have around 20630 entries, of which 20% will be used for testing and the rest will be used for training and validataion
# We will be using k-flod cross validation, therefore we do not need to split train data further into two parts as our k-flod CV
# will do that for us
seed = 9
np.random.seed(seed)
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = seed)

In [13]:
train_data.shape, test_data.shape

((16504, 5), (4126, 5))

In [14]:
# This will be random every time we split our data
train_data[:5]

Unnamed: 0,title,author,text,label,combined_input
11762,The French Fear Islamization but Do Nothing,Guillaume Durocher,"Posted on November 4, 2016 The French Fear Isl...",1,Guillaume DurocherThe French Fear Islamization...
2179,What We Just Witnessed Has Rarely Occurred In ...,King World News,32 King World News \nOn the heels of yesterd...,1,King World NewsWhat We Just Witnessed Has Rare...
11951,Flight Attendants Fight Human Trafficking With...,Jacey Fortin,"Donna Hubbard, a flight attendant who lives ou...",0,Jacey FortinFlight Attendants Fight Human Traf...
19925,"Rockefeller Foundation Picks Rajiv J. Shah, a ...",David Gelles,"Rajiv J. Shah, a trustee of the Rockefeller Fo...",0,David GellesRockefeller Foundation Picks Rajiv...
10052,Carl Bernstein: Hillary Scandals Not In The “S...,James Fulford,,1,James FulfordCarl Bernstein: Hillary Scandals ...


In [15]:
test_data[:5]

Unnamed: 0,title,author,text,label,combined_input
2277,Fifth Mexican Journalist Murdered in 90 Days,Ildefonso Ortiz and Brandon Darby,Suspected cartel gunmen killed another journ...,0,Ildefonso Ortiz and Brandon DarbyFifth Mexican...
8066,"Suburban Chicago School Teaches ’Blackenomics,...",Warner Todd Huston,A suburban Chicago high school is taking the O...,0,Warner Todd HustonSuburban Chicago School Teac...
16081,John Podesta’s Sister-in-Law Lobbied For Rayth...,Michael Krieger,at 11:08 am 1 Comment \nThe Podesta family see...,1,Michael KriegerJohn Podesta’s Sister-in-Law Lo...
16560,"Review: Graham, Cunningham and Taylor, All Tog...",Alastair Macaulay,"The triple bill of dances by Martha Graham, Me...",0,"Alastair MacaulayReview: Graham, Cunningham an..."
5565,"Ashton Kutcher Rescues 6,000 Sex Trafficking V...",Amando Flavio,Christopher Ashton Kutcher is a well-known fig...,1,"Amando FlavioAshton Kutcher Rescues 6,000 Sex ..."


#### Train features and targets

In [16]:
X_train, y_train = train_data.iloc[:,4].values, train_data.iloc[:,3].values

In [17]:
X_train[:1]

      dtype=object)

In [18]:
y_train[:1]

array([1], dtype=int64)

#### Test features and targets

In [19]:
X_test, y_test = test_data.iloc[:,4].values, test_data.iloc[:,3].values

In [20]:
X_test[:1]

array(['Ildefonso Ortiz and Brandon DarbyFifth Mexican Journalist Murdered in 90 DaysSuspected cartel gunmen killed another   journalist. This year, reporters exposing drug cartels and their ties to Mexican politicians have become targets with five murders taking place in 2017. [Mexico’s Rio Doce confirmed the murder of its founder, Javier Valdez, an   investigator and author who had been reporting on Mexico’s organized crime. Valdez was driving a red Toyota Corolla along a city street in Culiacan, Sinaloa, when unidentified gunmen shot him, Rio Doce reported. The local print weekly and online publication is one of the few news outlets that continues to carry out   investigations in Mexico exposing the deep ties between Mexican politicians and drug cartels.   Valdez’s murder comes just weeks after cartel gunmen murdered respected journalist Maximino Rodriguez Palacios in Baja California Sur as he drove with his wife to a shopping center, Breitbart Texas reported. The murder remains uns

In [21]:
y_test[:1]

array([0], dtype=int64)

In [22]:
import re
# from bs4 import BeautifulSoup

In [23]:
"""
REMOVE_STOPWORDS truncates stopwrds from the string and returns modified string
    INPUT:
        >> string
    OUTPUT:
        >> Modified string without stopwords
"""
def remove_stopwords(text):
    text = [word for word in text.split() if word not in STOPWORDS]
    text = ' '.join(text)
    return text

In [24]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
     # convert all characters in a string to lowercase
    text = text.lower()
    
    #replace Symbols with a space in string
    text = re.sub(REPLACE_BY_SPACE_RE, " ",text)
    
    # delete unwanted synbols from string
    text = re.sub(BAD_SYMBOLS_RE,"", text)
    
    # delete stopwords from text
    text = remove_stopwords(text)
    
    return text

In [25]:
X_train = [text_prepare(x) for x in X_train]

In [26]:
X_test = [text_prepare(x) for x in X_test]

In [27]:
X_train[:1]



In [28]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Import data

In [29]:
data = pd.read_csv('data/dataset_3/train/train_data.csv')
data.shape

(20630, 4)

In [30]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [31]:
data['label'].value_counts()

0    10343
1    10287
Name: label, dtype: int64

In [32]:
data['author'].value_counts().head(10)

Pam Key                243
admin                  193
Jerome Hudson          166
Charlie Spiering       141
John Hayward           140
Katherine Rodriguez    124
Warner Todd Huston     122
Ian Hanchett           119
Breitbart News         118
Daniel Nussbaum        112
Name: author, dtype: int64

### Text Preprocessing

In [33]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\abhi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [34]:
from sklearn.model_selection import train_test_split

In [35]:
# Remove empty (NaN) values with na in dataset
data = data.replace(np.nan, 'na', regex=True)

In [36]:
data.head(5)

Unnamed: 0,title,author,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


#### Concatenate feature inputs

In [37]:
frames = [data['title'],data['author'],data['text']]
new_data = pd.Series.to_frame(pd.concat(frames))
new_data['label'] = data['label']
new_data[:1]

Unnamed: 0,0,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1


In [38]:
data['combined_input'] = data['author'].map(str) + data['title'].map(str)+data['text'].map(str)

In [39]:
# Split data into train and test datasets
# We have around 20630 entries, of which 20% will be used for testing and the rest will be used for training and validataion
# We will be using k-flod cross validation, therefore we do not need to split train data further into two parts as our k-flod CV
# will do that for us
seed = 9
np.random.seed(seed)
train_data, test_data = train_test_split(data, test_size = 0.2, random_state = seed)

In [40]:
train_data.shape, test_data.shape

((16504, 5), (4126, 5))

In [41]:
# This will be random every time we split our data
train_data[:5]

Unnamed: 0,title,author,text,label,combined_input
11762,The French Fear Islamization but Do Nothing,Guillaume Durocher,"Posted on November 4, 2016 The French Fear Isl...",1,Guillaume DurocherThe French Fear Islamization...
2179,What We Just Witnessed Has Rarely Occurred In ...,King World News,32 King World News \nOn the heels of yesterd...,1,King World NewsWhat We Just Witnessed Has Rare...
11951,Flight Attendants Fight Human Trafficking With...,Jacey Fortin,"Donna Hubbard, a flight attendant who lives ou...",0,Jacey FortinFlight Attendants Fight Human Traf...
19925,"Rockefeller Foundation Picks Rajiv J. Shah, a ...",David Gelles,"Rajiv J. Shah, a trustee of the Rockefeller Fo...",0,David GellesRockefeller Foundation Picks Rajiv...
10052,Carl Bernstein: Hillary Scandals Not In The “S...,James Fulford,,1,James FulfordCarl Bernstein: Hillary Scandals ...


In [42]:
test_data[:5]

Unnamed: 0,title,author,text,label,combined_input
2277,Fifth Mexican Journalist Murdered in 90 Days,Ildefonso Ortiz and Brandon Darby,Suspected cartel gunmen killed another journ...,0,Ildefonso Ortiz and Brandon DarbyFifth Mexican...
8066,"Suburban Chicago School Teaches ’Blackenomics,...",Warner Todd Huston,A suburban Chicago high school is taking the O...,0,Warner Todd HustonSuburban Chicago School Teac...
16081,John Podesta’s Sister-in-Law Lobbied For Rayth...,Michael Krieger,at 11:08 am 1 Comment \nThe Podesta family see...,1,Michael KriegerJohn Podesta’s Sister-in-Law Lo...
16560,"Review: Graham, Cunningham and Taylor, All Tog...",Alastair Macaulay,"The triple bill of dances by Martha Graham, Me...",0,"Alastair MacaulayReview: Graham, Cunningham an..."
5565,"Ashton Kutcher Rescues 6,000 Sex Trafficking V...",Amando Flavio,Christopher Ashton Kutcher is a well-known fig...,1,"Amando FlavioAshton Kutcher Rescues 6,000 Sex ..."


#### Train features and targets

In [43]:
X_train, y_train = train_data.iloc[:,4].values, train_data.iloc[:,3].values

In [44]:
X_train[:1]

      dtype=object)

In [45]:
y_train[:1]

array([1], dtype=int64)

#### Test features and targets

In [46]:
X_test, y_test = test_data.iloc[:,4].values, test_data.iloc[:,3].values

In [47]:
X_test[:1]

array(['Ildefonso Ortiz and Brandon DarbyFifth Mexican Journalist Murdered in 90 DaysSuspected cartel gunmen killed another   journalist. This year, reporters exposing drug cartels and their ties to Mexican politicians have become targets with five murders taking place in 2017. [Mexico’s Rio Doce confirmed the murder of its founder, Javier Valdez, an   investigator and author who had been reporting on Mexico’s organized crime. Valdez was driving a red Toyota Corolla along a city street in Culiacan, Sinaloa, when unidentified gunmen shot him, Rio Doce reported. The local print weekly and online publication is one of the few news outlets that continues to carry out   investigations in Mexico exposing the deep ties between Mexican politicians and drug cartels.   Valdez’s murder comes just weeks after cartel gunmen murdered respected journalist Maximino Rodriguez Palacios in Baja California Sur as he drove with his wife to a shopping center, Breitbart Texas reported. The murder remains uns

In [48]:
y_test[:1]

array([0], dtype=int64)

In [49]:
import re
# from bs4 import BeautifulSoup

In [50]:
"""
REMOVE_STOPWORDS truncates stopwrds from the string and returns modified string
    INPUT:
        >> string
    OUTPUT:
        >> Modified string without stopwords
"""
def remove_stopwords(text):
    text = [word for word in text.split() if word not in STOPWORDS]
    text = ' '.join(text)
    return text

In [51]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def text_prepare(text):
    """
        text: a string
        
        return: modified initial string
    """
     # convert all characters in a string to lowercase
    text = text.lower()
    
    #replace Symbols with a space in string
    text = re.sub(REPLACE_BY_SPACE_RE, " ",text)
    
    # delete unwanted synbols from string
    text = re.sub(BAD_SYMBOLS_RE,"", text)
    
    # delete stopwords from text
    text = remove_stopwords(text)
    
    return text

In [52]:
X_train = [text_prepare(x) for x in X_train]

In [53]:
X_test = [text_prepare(x) for x in X_test]

In [54]:
X_train[:1]



### Random Forest

#### Training

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

In [57]:
vectorizer = CountVectorizer(analyzer='word',tokenizer=None, preprocessor=None, stop_words=None, max_features=5000)

In [58]:
X_train_features = vectorizer.fit_transform(X_train)
X_train_features = X_train_features.toarray()

In [59]:
X_train_features.shape

(16504, 5000)

In [60]:
vocab = vectorizer.get_feature_names
vocab

<bound method CountVectorizer.get_feature_names of CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=5000, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)>

In [61]:
from sklearn.ensemble import RandomForestClassifier

In [62]:
forest = RandomForestClassifier(n_estimators=100)

In [63]:
# Fit the forest to the training set, using the bag of words as 
# features and the labels as the response variable

forest = forest.fit( X_train_features, y_train )

#### Testing

In [64]:
X_test_features = vectorizer.fit_transform(X_test)
X_test_features = X_test_features.toarray()

In [65]:
# Prediction
result = forest.predict(X_test_features)

In [81]:
score = forest.score(X_test_features, y_test)
score

0.5942801745031507

In [74]:
# Copy the results to a pandas dataframe with an "id" column and
# a "sentiment" column
output = pd.DataFrame( data={"title":test_data["title"], "label_target":test_data["label"],"label_pred":result} )

In [76]:
output.head(5)

Unnamed: 0,title,label_target,label_pred
2277,Fifth Mexican Journalist Murdered in 90 Days,0,1
8066,"Suburban Chicago School Teaches ’Blackenomics,...",0,1
16081,John Podesta’s Sister-in-Law Lobbied For Rayth...,1,1
16560,"Review: Graham, Cunningham and Taylor, All Tog...",0,1
5565,"Ashton Kutcher Rescues 6,000 Sex Trafficking V...",1,1
