In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.naive_bayes import GaussianNB 
from sklearn.metrics import precision_recall_fscore_support,accuracy_score


In [14]:
# Column names handed to read_csv for the Proppy TSV files, in file order.
colnames = [
    'text', 'event_location', 'average_tone', 'article_date', 'article_ID',
    'article_URL_1', 'MBFC_factuality_label_1', 'article_URL',
    'MBFC_factuality_label', 'URL_to_MBFC_page', 'source_name',
    'MBFC_notes_about_source', 'MBFC_bias_label', 'source_URL',
    'propaganda_label',
]

# Read the train and dev splits, keeping just the four columns used later on.
train_df = pd.read_csv("../data_proppy/proppy_1.0.train.tsv", names=colnames, sep='\t')
train_df = train_df[["text", 'average_tone', 'source_name', 'propaganda_label']]
eval_df = pd.read_csv("../data_proppy/proppy_1.0.dev.tsv", names=colnames, sep='\t')
eval_df = eval_df[["text", 'average_tone', 'source_name', 'propaganda_label']]

# Collapse the label to binary: 1 where the raw label equals 1, otherwise 0.
train_df['propaganda_label'] = train_df['propaganda_label'].eq(1).astype('int64')
eval_df['propaganda_label'] = eval_df['propaganda_label'].eq(1).astype('int64')

# Show up to 150 characters of each cell so article text is readable.
pd.set_option('display.max_colwidth', 150)

train_df

Unnamed: 0,text,average_tone,source_name,propaganda_label
0,"Et tu, Rhody? A recent editorial in the Providence Journal cataloged everything it could find wrong with Connecticut and ended with this suggesti...",-3.181818,The Hartford Courant,0
1,A recent post in The Farmington Mirror — our town’s version of The Onion — encouraged parents to take advantage of a shuttle service offered by th...,-0.424328,The Hartford Courant,0
2,"President Donald Trump, as he often does while responding to natural disasters, mass shootings or unfolding crises, spent much of his time congrat...",-2.469136,The Hartford Courant,0
3,"February is Black History Month, and nothing looms larger in black history than the evil specter of slavery. Three exhibits in the state take on t...",-0.894632,The Hartford Courant,0
4,"The snow was so heavy, whipped up by gusting winds, that travel was nearly impossible. Thousands abandoned their cars. Ambulances could not pass t...",-4.800000,The Hartford Courant,0
...,...,...,...,...
35981,"From The Telegraph: Towns in Brazil have become refugee camps for a tide of desperate Venezuelans 30 AUGUST 2018 • 6:00AM Johan Rodriguez, a buil...",-3.193277,lewrockwell.com,1
35982,The second episode of Consortium News on Flash Points focuses on two different perspectives on John McCain and the real meaning of Russian interfe...,0.526316,lewrockwell.com,1
35983,"It is beginning. Actually, it’s been happening for a long time – like a slowly metastasizing cancer. The afflicted can no longer hide the underly...",-3.455285,lewrockwell.com,1
35984,"Justin’s note: As regular Dispatch readers know, every Friday we feature no-filtered insights from Doug Casey. You see, Doug isn’t just a world-cl...",-1.052049,lewrockwell.com,1


In [15]:
# Switch: when True, the `if balance_cats` step below passes the train and eval
# frames through balance_set so every label ends up with the same number of rows.
balance_cats = True 

def balance_set(df_in, label_col, min_label, random_state=None):
    """Downsample every other class to the size of the ``min_label`` class.

    All rows whose ``label_col`` equals ``min_label`` are kept. For each other
    label value, a random sample (without replacement) of the same number of
    rows is drawn. The combined rows are returned in shuffled order, with
    their original index values.

    Parameters
    ----------
    df_in : pandas.DataFrame
        Frame to balance; it is not modified.
    label_col : str
        Name of the column holding the class labels.
    min_label : scalar
        Label whose row count sets the per-class sample size.
    random_state : int, numpy Generator/RandomState, or None, default None
        Forwarded to ``DataFrame.sample`` for both the per-class draw and the
        final shuffle. ``None`` keeps the previous unseeded behaviour; pass an
        int to make the result reproducible.

    Returns
    -------
    pandas.DataFrame
        Balanced, shuffled subset of ``df_in``.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` if some other class has fewer rows than
        the ``min_label`` class (sampling is without replacement).
    """
    reference = df_in[df_in[label_col] == min_label]
    n_to_take = reference.shape[0]

    # Collect the pieces and concatenate once: DataFrame.append was removed in
    # pandas 2.0, and a single concat avoids re-copying the frame per class.
    pieces = [reference]
    for lab in df_in[label_col].unique():
        if lab == min_label:
            continue
        pieces.append(
            df_in[df_in[label_col] == lab].sample(n=n_to_take, random_state=random_state)
        )

    # frac=1 returns every row in random order.
    return pd.concat(pieces).sample(frac=1, random_state=random_state)

# When balancing is enabled, equalise the label counts in both splits,
# using label 1 as the class whose size sets the sample count.
if balance_cats:
    train_df = balance_set(df_in=train_df, label_col='propaganda_label', min_label=1)
    eval_df = balance_set(df_in=eval_df, label_col='propaganda_label', min_label=1)

In [16]:
# Per-label summary of the train set: "Count" is the non-null source_name count,
# "frac" is the non-null text count divided by the total of "Count".
counts = (
    train_df[["source_name", "propaganda_label", "text"]]
    .groupby("propaganda_label")
    .count()
    .rename(columns={"source_name": "Count", "text": "frac"})
    .assign(frac=lambda tally: tally["frac"] / tally["Count"].sum())
)
counts

Unnamed: 0_level_0,Count,frac
propaganda_label,Unnamed: 1_level_1,Unnamed: 2_level_1
0,4021,0.5
1,4021,0.5


In [17]:
# English stop words from NLTK, held in a set.
stops = {*stopwords.words('english')}

# Sanity check that tokenizing and stop-word filtering compose; the membership
# test is on the raw token, so capitalised words such as "All" are not removed.
list(filter(lambda token: token not in stops,
            word_tokenize("All work and no play makes jack a dull boy.")))

['All', 'work', 'play', 'makes', 'jack', 'dull', 'boy', '.']

In [18]:
# Vectorize the train set into 1- to 3-gram counts, then map the eval set onto
# the vocabulary learned from train (transform only, no refit).
vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=word_tokenize,
    # stop_words is documented as 'english', a list, or None; scikit-learn
    # releases that validate parameters reject a set, so hand it a list.
    stop_words=sorted(stops),
    ngram_range=(1, 3),  # might want to hyperparameter tune this
    max_df=0.9,          # might want to hyperparameter tune this
    min_df=0.001,        # might want to hyperparameter tune this
)
X_train = vectorizer.fit_transform(train_df["text"].to_list())
X_eval = vectorizer.transform(eval_df["text"].to_list())
X_train



<8042x72904 sparse matrix of type '<class 'numpy.int64'>'
	with 3624741 stored elements in Compressed Sparse Row format>

In [19]:
# Fit a Gaussian naive Bayes classifier on the n-gram counts.
# The sparse count matrix is expanded to a dense array before being passed to fit.
classifier = GaussianNB()
classifier.fit(
    X=X_train.toarray(),
    y=train_df['propaganda_label'].to_numpy(),
)

classifier

GaussianNB()

In [20]:
# Score the classifier on the eval features built with ngram_range=(1,3).
# Precision / recall / f1 are reported for label 1 only (index 1 of each array).
preds = classifier.predict(X=X_eval.toarray())
eval_df["preds"] = preds
prec, recall, f_1, sup = precision_recall_fscore_support(
    y_true=eval_df["propaganda_label"],
    y_pred=eval_df["preds"],
)
accuracy = accuracy_score(
    y_true=eval_df["propaganda_label"],
    y_pred=eval_df["preds"],
)
stats_arr = {
    "Precision": prec[1],
    "recall": recall[1],
    "f1": f_1[1],
    "Accuracy": accuracy,
}
stats = pd.DataFrame(data=stats_arr, index=['score'])
stats



Unnamed: 0,Precision,recall,f1,Accuracy
score,0.812592,0.965217,0.882353,0.871304


In [21]:
# Unigram naive Bayes features: single-token counts, keeping only terms that
# appear in at least 1% of the training documents.
vectorizer = CountVectorizer(
    lowercase=True,
    tokenizer=word_tokenize,
    # stop_words is documented as 'english', a list, or None; scikit-learn
    # releases that validate parameters reject a set, so hand it a list.
    stop_words=sorted(stops),
    ngram_range=(1, 1),  # might want to hyperparameter tune this
    max_df=1.0,          # might want to hyperparameter tune this
    min_df=0.01,         # might want to hyperparameter tune this
)
X_train_simp = vectorizer.fit_transform(train_df["text"].to_list())
X_eval_simp = vectorizer.transform(eval_df["text"].to_list())
X_train_simp



<8042x4846 sparse matrix of type '<class 'numpy.int64'>'
	with 1684269 stored elements in Compressed Sparse Row format>

In [22]:
# Fit a fresh Gaussian naive Bayes classifier on the unigram counts.
# The sparse count matrix is expanded to a dense array before being passed to fit.
classifier = GaussianNB()
classifier.fit(
    X=X_train_simp.toarray(),
    y=train_df['propaganda_label'].to_numpy(),
)

classifier

GaussianNB()

In [24]:
# Eval-split metrics for the unigram model (label 1 is the reported class).
preds = classifier.predict(X=X_eval_simp.toarray())
eval_df["preds"] = preds
prec, recall, f_1, sup = precision_recall_fscore_support(
    y_true=eval_df["propaganda_label"],
    y_pred=eval_df["preds"],
)
accuracy = accuracy_score(
    y_true=eval_df["propaganda_label"],
    y_pred=eval_df["preds"],
)
stats_arr = {
    "Precision": prec[1],
    "recall": recall[1],
    "f1": f_1[1],
    "Accuracy": accuracy,
}

# Same metrics on the training split, to compare against the eval numbers.
train_preds = classifier.predict(X=X_train_simp.toarray())
train_df['preds'] = train_preds
prec, recall, f_1, sup = precision_recall_fscore_support(
    y_true=train_df["propaganda_label"],
    y_pred=train_df["preds"],
)
accuracy = accuracy_score(
    y_true=train_df["propaganda_label"],
    y_pred=train_df["preds"],
)
stats_arr_train = {
    "Precision": prec[1],
    "recall": recall[1],
    "f1": f_1[1],
    "Accuracy": accuracy,
}

# One row per split, then flip so the metrics run down the rows.
stats = pd.DataFrame(data=[stats_arr_train, stats_arr], index=['train', 'test'])
stats.T

Unnamed: 0,train,test
Precision,0.902741,0.837736
recall,0.761751,0.772174
f1,0.826275,0.80362
Accuracy,0.839841,0.811304
