In [1]:
import pickle
import pandas as pd
from nltk.corpus import stopwords
stop = stopwords.words('english')
import re
import matplotlib
from matplotlib import pyplot as plt
from nltk.stem.wordnet import WordNetLemmatizer
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import time
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import numpy as np

In [2]:
# Tab-separated input file read by the loading cell below.
# NOTE(review): absolute, machine-specific path — consider making it configurable.
titleFile = '/scratch/anamikas/hyp/postVac/results/tsvfile_byPublisher.tab'

In [3]:
# Column names are supplied explicitly, so every line of the file is read as data
# (header=None is what pandas already infers when `names` is given).
titleFileDF = pd.read_csv(
    titleFile,
    sep="\t",
    header=None,
    names=['id', 'title', 'hyperpartisan'],
)

In [4]:
# Preview the first rows to check that the three columns parsed as expected.
titleFileDF.head()

Unnamed: 0,id,title,hyperpartisan
0,1,After DeVos Announced Plans To Reexamine Title...,True
1,2,University To Award Trayvon Martin With Posthu...,True
2,8,Texas State University suspends Greek life aft...,False
3,12,Jewish Organization's Huge Day Of Unity On Tue...,True
4,15,"BREAKING: Trump Reaches Agreement To Keep 1,00...",True


In [5]:
# Total number of rows loaded from the file.
len(titleFileDF)

599521

In [6]:
def length(text):
    """Return the number of whitespace-separated tokens in ``text``.

    ``text`` must be a string (anything with a ``split()`` method); an empty
    or whitespace-only string yields 0.
    """
    tokens = text.split()
    return len(tokens)

In [7]:
# Missing titles become empty strings so that length(), which calls .split(),
# never receives a NaN.
titleFileDF['title_new'] = titleFileDF['title'].fillna(value="")

In [8]:
# Word count per title: number of whitespace-separated tokens, computed by length().
titleFileDF['length_words'] = titleFileDF['title_new'].apply(length)

In [9]:
# Preview again, now including the title_new and length_words columns.
titleFileDF.head()

Unnamed: 0,id,title,hyperpartisan,title_new,length_words
0,1,After DeVos Announced Plans To Reexamine Title...,True,After DeVos Announced Plans To Reexamine Title...,20
1,2,University To Award Trayvon Martin With Posthu...,True,University To Award Trayvon Martin With Posthu...,10
2,8,Texas State University suspends Greek life aft...,False,Texas State University suspends Greek life aft...,9
3,12,Jewish Organization's Huge Day Of Unity On Tue...,True,Jewish Organization's Huge Day Of Unity On Tue...,8
4,15,"BREAKING: Trump Reaches Agreement To Keep 1,00...",True,"BREAKING: Trump Reaches Agreement To Keep 1,00...",15


In [10]:
# Boolean mask: True where the NaN-filled title is a non-empty string.
# NOTE(review): the name `filter` shadows Python's built-in filter() from this cell onward.
filter = titleFileDF["title_new"] != ""

In [11]:
# Keep only the rows whose title is non-empty. The mask is built inline so this
# cell does not rely on the global name `filter`, which is also a Python built-in:
# on a kernel where the mask cell has not run, indexing with the built-in would fail.
# NOTE(review): the later preprocessing cells visible in this notebook keep using
# titleFileDF rather than this filtered frame — confirm that is intended.
titleFileDFNew = titleFileDF[titleFileDF["title_new"] != ""]

In [12]:
# Shape after dropping the rows with an empty title.
titleFileDFNew.shape

(597848, 5)

In [13]:
# Strip punctuation: delete every character that is neither a word character nor
# whitespace. regex=True is passed explicitly because pandas >= 2.0 treats the
# pattern as a literal string by default, which would leave the titles unchanged.
# The pattern is a raw string so the backslashes reach the regex engine untouched.
# NOTE: despite the column name, lower-casing is done in a later cell, not here.
titleFileDF['lower_title'] = titleFileDF['title'].str.replace(r'[^\w\s]', '', regex=True)

In [14]:
# Rows whose original title was missing are still NaN after the string
# replacement; turn them into empty strings before further text processing.
titleFileDF['lower_title'] = titleFileDF['lower_title'].fillna(value="")

In [15]:
# Remove every run of digits. regex=True is required on pandas >= 2.0, where
# str.replace defaults to literal matching and the pattern would never match;
# the raw string avoids an invalid-escape warning for "\d".
titleFileDF['withoutNumbers'] = titleFileDF['lower_title'].str.replace(r'\d+', '', regex=True)

In [16]:
# Lower-case each title and collapse every run of whitespace to a single space.
titleFileDF['lowerprep'] = titleFileDF['withoutNumbers'].apply(
    lambda title: " ".join(title.lower().split())
)

In [17]:
# Single WordNet lemmatizer instance, used by the lemmatization cell that follows.
lemma = WordNetLemmatizer()

In [18]:
# Lemmatize token by token (lemmatize() is called without a POS tag) and
# re-join the tokens with single spaces.
titleFileDF['lemma'] = titleFileDF['lowerprep'].apply(
    lambda text: " ".join(lemma.lemmatize(token) for token in text.split())
)

In [19]:
# Empty frame that the next cells fill with the label and the cleaned title.
train = pd.DataFrame()

In [20]:
# Label column, copied across unchanged.
train['hyperpartisan'] = titleFileDF['hyperpartisan']

In [21]:
# Model input text: the lemmatized, lower-cased, punctuation- and digit-free title.
train['lemmatizedTitle'] = titleFileDF['lemma']

In [22]:
# Token count of the cleaned title, computed by length().
train['lemmatizedLen'] = train['lemmatizedTitle'].apply(length)

In [23]:
# Subset of rows labelled hyperpartisan.
TrainTrue = train.loc[train['hyperpartisan'] == True]

In [24]:
# Size of the hyperpartisan subset.
TrainTrue.shape

(299758, 3)

In [25]:
# Subset of rows labelled not hyperpartisan.
TrainFalse = train.loc[train['hyperpartisan'] == False]

In [26]:
    # Size of the non-hyperpartisan subset, for comparison with TrainTrue.
    # NOTE(review): this cell's code is indented; a plain-Python export of the
    # notebook would presumably reject it — consider removing the indentation.
    TrainFalse.shape

(299763, 3)

In [30]:
# Pick the surplus non-hyperpartisan rows to drop so both classes end up the same size.
# - The count is derived from the two subsets instead of a hard-coded 5; it is
#   clamped at 0, so nothing is selected when TrainFalse is not the larger class.
# - replace=False guarantees distinct labels; sampling with replacement can repeat
#   a label, in which case fewer rows than intended would be dropped.
# - A fixed seed makes the selection repeatable across kernel restarts.
rows = np.random.RandomState(42).choice(
    TrainFalse.index.values,
    size=max(len(TrainFalse) - len(TrainTrue), 0),
    replace=False,
)

In [32]:
# Remove the selected index labels from the non-hyperpartisan subset.
sampledTrainFalse = TrainFalse.drop(index=rows)

In [33]:
# Size of the non-hyperpartisan subset after dropping the selected rows.
sampledTrainFalse.shape

(299758, 3)

In [34]:
# Stack the two class subsets row-wise; original index labels are kept.
dataset = pd.concat([TrainTrue, sampledTrainFalse], axis=0)

In [35]:
# Shuffle all rows (frac=1 samples the whole frame without replacement).
# random_state pins the permutation so a Restart & Run All reproduces it.
shuffleDataset = dataset.sample(frac=1, random_state=42)

In [36]:
# Hold out 20% of the rows for validation. random_state fixes the split so the
# accuracy reported at the end of the notebook can be reproduced.
trained, tested = train_test_split(shuffleDataset, test_size=0.2, random_state=42)

In [37]:
# Columns carried into the training split.
trained.columns

Index(['hyperpartisan', 'lemmatizedTitle', 'lemmatizedLen'], dtype='object')

In [38]:
# Size of the training split.
trained.shape

(479612, 3)

In [39]:
# Size of the held-out split.
tested.shape

(119904, 3)

In [40]:
# Training features: every column except the label.
x_train = trained.drop(columns=["hyperpartisan"])

In [41]:
# Training labels (raw values, before integer encoding).
y_train = trained["hyperpartisan"]

In [42]:
# Encoder that maps the label values to integer class ids.
lbl_enc = preprocessing.LabelEncoder()

In [43]:
# Fit the encoder on the training labels and encode them in one step.
y_tr = lbl_enc.fit_transform(y_train)

In [44]:
# Validation features: every column except the label.
x_val = tested.drop(columns=["hyperpartisan"])

In [45]:
# Validation labels (raw values, before integer encoding).
y_val = tested["hyperpartisan"]

In [46]:
# Encode the validation labels with the mapping already learned from the training
# labels. Calling fit_transform here would refit the encoder on the validation
# split, and a split missing one of the classes would then be numbered differently
# from the training labels.
y_te = lbl_enc.transform(y_val)

In [47]:
# TF-IDF over word uni-, bi- and tri-grams.
# - token_pattern r'\w{1,}' also keeps single-character tokens.
# - strip_accents='unicode' normalises accented characters before tokenising.
# - use_idf / smooth_idf / sublinear_tf are real booleans: the original passed the
#   integer 1, which recent scikit-learn releases reject during parameter validation.
tfv = TfidfVectorizer(
    min_df=1,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    use_idf=True,
    smooth_idf=True,
    sublinear_tf=True,
)

In [48]:
# Learn the vocabulary and IDF weights from the training titles only, and print
# how long the fit took in wall-clock seconds.
start = time.time()
tfv.fit(list(x_train['lemmatizedTitle']))
end = time.time()
print(end - start)

32.4683358669281


In [49]:
# Vectorise the training titles with the fitted TF-IDF model.
xtrain_tfv = tfv.transform(x_train['lemmatizedTitle'])

In [50]:
# (number of training titles, number of n-gram features).
xtrain_tfv.shape

(479612, 3648031)

In [51]:
# Vectorise the validation titles with the vocabulary learned from the training split.
xvalid_tfv = tfv.transform(x_val['lemmatizedTitle'])

In [52]:
# (number of validation titles, number of n-gram features).
xvalid_tfv.shape

(119904, 3648031)

In [53]:
# Logistic-regression classifier; C is passed explicitly and every other
# setting is left at the library default.
clf = LogisticRegression(C=1.0)

In [54]:
# Fit the classifier on the TF-IDF training matrix and print the elapsed
# wall-clock time in seconds.
start=time.time()
clf.fit(xtrain_tfv, y_tr)
print(time.time()-start)

14.50297474861145


In [55]:
# Predicted class ids for the validation titles.
predictions = clf.predict(xvalid_tfv)

In [56]:
# Fraction of validation titles whose predicted class matches the encoded label.
accuracy_score(y_true=y_te, y_pred=predictions)

0.7942020282892981