## The Dependencies

In [4]:
!pip install transformers
!pip install torch torchvision
import pandas as pd
import numpy as np
import nltk
import re
from collections import defaultdict
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import torch
from transformers import BertTokenizer, BertModel
import logging
#logging.basicConfig(level=logging.INFO)
import matplotlib.pyplot as plt
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')



## The Data 

In [5]:
conservative = pd.read_csv(f"news_conservative.csv") #conservative comments from reddit
news = pd.read_csv(f"news_news.csv") #control news comments from reddit
reddit = pd.concat([news, conservative]) # both
test = pd.read_csv(f"test.csv") # test data we collected
test_B = pd.read_pickle("./test.pkl") # test data trained on bert
reddit_B = pd.read_pickle("./reddit.pkl") # reddit data trained on bert


## Clean Data

In [6]:
def binary_codding(label):
  if (label == 'news'):
    return 1
  elif (label == 'conservative'):
    return 0

# cut un-useful data
reddit = reddit[['0','2']]
conservative = conservative[['0','2']]
news = news[['0','2']]

# convert comment type to binary - news = 1 - conservative = 0 
reddit['2'] = reddit['2'].map(binary_codding)
conservative['2'] = conservative['2'].map(binary_codding)
news['2'] = news['2'].map(binary_codding)

# clean out non strings and strigs over 500 char
reddit = reddit[reddit['0'].map(type) == str]
conservative = conservative[conservative['0'].map(type) == str]
news = news[news['0'].map(type) == str]
reddit = reddit[reddit['0'].map(len)  < 500]
conservative = conservative[conservative['0'].map(len)  < 500]
news = news[news['0'].map(len)  < 500]
test = test[test['comment'].map(type) == str]
test = test[test['comment'].map(len)  < 500]


## BOW Implementation
We started with a basic bag of words implimentation of word to vector as a feature in logistic regression

In [7]:
vocab = dict()

def vocabmaker( dataset ):
    
    for doc in dataset:
        if(type(doc) == str):
            tokens = nltk.word_tokenize(doc)
            for word in tokens:
                vocab.update({word : 1})
        

def BOW( dataset ):
    
    column = []
    for doc in dataset:
        if(type(doc) == str):
            bow = defaultdict(float)
            tokens = nltk.word_tokenize(doc)
            for word in tokens:
                if word in vocab:
                    bow[word] += 1
        column.append(bow)
        
    column.append(vocab)
    
    v = DictVectorizer(sparse=False)
    X = v.fit_transform(column)
    return X[0:(X.shape[0]-1), 0: X.shape[1]]
     

In [8]:
X_train, X_test, y_train, y_test = train_test_split(
    reddit['0'], reddit['2'], test_size=0.2, random_state=2020
)



In [9]:
vocab = defaultdict(float)
vocabmaker(X_train)
bowTrain = BOW(X_train)
bowTest = BOW(X_test)

In [11]:
clf = LogisticRegression(penalty='l2',random_state=0, max_iter = 100000)
clf.fit(bowTrain,y_train)

LogisticRegression(max_iter=100000, random_state=0)

In [12]:
print("Accuracy: %0.2f" % clf.score(bowTest, y_test))

Accuracy: 1.00


In [13]:
vocab = defaultdict(float)
vocabmaker(reddit['0'])
bowTrain = BOW(reddit['0'])
bowTest = BOW(test['comment'])

In [14]:
clf2 = LogisticRegression(penalty='l2',random_state=0, max_iter = 100000)
clf2.fit(bowTrain,reddit['2'])

LogisticRegression(max_iter=100000, random_state=0)

In [15]:
print("Accuracy: %0.2f" % clf.score(bowTest, test['label']))

Accuracy: 0.53


In [16]:
X_train_B = []
y_train_B = []
X_test_B = []
y_test_B = []

for x in reddit_B["sentence_embedding"]:  
    if type(x) != torch.Tensor:
        continue
    X_train_B.append(x.numpy())
    


for x in reddit_B['2']:
    y_train_B.append(x)
    

for x in test_B["sentence_embedding"]:
    if type(x) != torch.Tensor:
        continue
    X_test_B.append(x.numpy())
    

for x in test_B['label']:
    y_test_B.append(x)
    


In [17]:
clf3 = LogisticRegression(penalty='l2',random_state=0, max_iter = 100000)
clf3.fit(X_train_B, y_train_B)

LogisticRegression(max_iter=100000, random_state=0)

In [18]:
print("Accuracy: %0.2f" % clf3.score(X_test_B, y_test_B))

Accuracy: 0.62


In [19]:
print(test_B)

                                               comment  label  \
0    How many in the Trump administration has alrea...      1   
1    When you join the Trump team of traitors,you w...      1   
2    So because they didn't charge him with anythin...      1   
3    There were dozens of Trump administration asso...      1   
4    If Page wins,then I'm suing my city.I was foll...      1   
..                                                 ...    ...   
595  How ironic, the very system (capitalism)they d...      0   
596  They are not public servants, they're somethin...      0   
597  Squad members are giving 25% of their earnings...      0   
598  Absolutely great! Now let’s unite against the ...      0   
599  Way to go Dr Miller-Meeks and congratulations....      0   

                                    sentence_embedding  
0    [tensor(0.3914), tensor(-1.1686), tensor(-1.57...  
1    [tensor(2.2838), tensor(-0.9486), tensor(0.670...  
2    [tensor(2.6092), tensor(-0.6065), tensor(-0

In [20]:
X_train, X_test, y_train, y_test = train_test_split(
    test_B['sentence_embedding'], test_B['label'], test_size=0.2, random_state=2020
)

X_train_B = []
y_train_B = []
X_test_B = []
y_test_B = []

for x in X_train:  
    if type(x) != torch.Tensor:
        continue
    X_train_B.append(x.numpy())
    

for x in y_train:
    y_train_B.append(x)
    

for x in X_test:
    if type(x) != torch.Tensor:
        continue
    X_test_B.append(x.numpy())
    

for x in y_test:
    y_test_B.append(x)
    


In [21]:
clf4 = LogisticRegression(penalty='l2',random_state=0, max_iter = 100000)
clf4.fit(X_train_B, y_train_B)
print("Accuracy: %0.2f" % clf4.score(X_test_B, y_test_B))

Accuracy: 0.75
