In [39]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, StratifiedKFold, train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [12]:
### Data cleaning

# Load only the article body from each source file and attach the class label:
# 0 = fake news, 1 = true news.
fake = pd.read_csv("data/fake.csv", usecols=["text"]).assign(target=0)
true = pd.read_csv("data/true.csv", usecols=["text"]).assign(target=1)

# Stack both sources, shuffle the rows, and renumber the index from 0.
# pd.concat is used because DataFrame.append was removed in pandas 2.0, and the
# fixed random_state keeps the shuffle (and every split derived from it)
# identical across kernel restarts.
data = (
    pd.concat([true, fake])
    .sample(frac=1, random_state=1234)
    .reset_index(drop=True)
)


In [13]:
# Inspect the merged, shuffled frame: the article `text` plus its 0/1 `target` label.
data

Unnamed: 0,text,target
0,BERLIN (Reuters) - Foreign Minister Sigmar Gab...,1
1,"Make no mistake, the Muslim European invaders ...",0
2,WASHINGTON (Reuters) - The head of a hard-line...,1
3,Seven Delaware state workers were indicted and...,0
4,North Carolina Republicans just would not list...,0
...,...,...
44893,Rep. Trey Gowdy was full of sarcasm (rightfull...,0
44894,The Senate had better stick to their guns on t...,0
44895,WASHINGTON (Reuters) - President Donald Trump ...,1
44896,BALTIMORE (Reuters) - A Maryland state senator...,1


In [25]:
# Hold out 30% of the articles (stratified by label); the remaining 70% is the training set.
train_text, val_test_text = train_test_split(
    data, test_size=0.3, random_state=1234, stratify=data['target']
)

# Divide the held-out 30% into validation (1/3 of it = 10% overall) and
# testing (2/3 of it = 20% overall), again stratified by label.
# test_size must be 2/3 for these proportions; 0.6 would yield 12% / 18% instead.
val_text, test_text = train_test_split(
    val_test_text, test_size=2 / 3, random_state=1234, stratify=val_test_text['target']
)

For our baseline model, we use a TF-IDF vectorizer to pre-process the articles and then apply a logistic regression classifier.

In [26]:
# Note: ngram_range is not specified, so only single words (unigrams) are used.
tfidf_options = {"stop_words": "english", "lowercase": True, "max_features": 150000}
text_transformer = TfidfVectorizer(**tfidf_options)

# Fit the vectorizer on the training articles only and produce their TF-IDF matrix.
X_train_text = text_transformer.fit_transform(train_text['text'])

# Project the validation and test articles with the already-fitted vectorizer
# (no re-fitting on these splits).
X_val_text, X_test_text = (
    text_transformer.transform(split['text']) for split in (val_text, test_text)
)

In [31]:
# Dimensions of the training TF-IDF matrix returned by fit_transform.
X_train_text.shape

(31428, 105298)

In [34]:
# L2-regularised logistic regression baseline.
# The 'sag' solver shuffles the samples stochastically, so random_state is fixed
# to make the fitted coefficients reproducible between runs.
# NOTE(review): `multi_class` is deprecated in recent scikit-learn releases (1.5+);
# it is kept here so the fitted model is unchanged — confirm against the installed
# version before upgrading.
logit = LogisticRegression(
    penalty='l2',
    C=1,
    solver='sag',
    multi_class='multinomial',
    random_state=1234,
)

In [35]:
# Train the baseline classifier on the training TF-IDF features and their labels.
logit.fit(X=X_train_text, y=train_text['target'])

LogisticRegression(C=1, multi_class='multinomial', solver='sag')