# Create Rule DataFrame

In [1]:
import pandas as pd
import re

rules_df = pd.read_csv('N-Grams Data.csv')

# Drop by position: column 0 is the previously saved index and column 3 is the
# n-gram frequency; neither is used by the rest of the notebook.
rules_df = rules_df.drop(columns=rules_df.columns[[0, 3]])

In [2]:
# Preview the rule table after the unused columns were dropped
rules_df

Unnamed: 0,Terms,Exceptions
0,the minimum wage,
1,a great job,
2,has done a,
3,go back to,
4,a lot of,
5,vote for him,
6,vote him out,
7,he has done,
8,needs to go,
9,one of the,


# Train N-Grams on Labeled Data

In [3]:
users_df = pd.read_csv('Labeled.csv')
# Drops the first column (presumably a previously saved index, as with the rules table)
users_df = users_df.drop(users_df.columns[0], axis = 1)


def _standardize_message(text):
    """Lowercase a message, remove standalone 'nan' tokens, and strip symbols.

    Everything except ASCII letters, digits and whitespace is removed.
    """
    text = text.lower()
    # Remove 'nan' only when it is a whole word. A plain substring replace also
    # mangles real words that contain it ('nancy' -> 'cy', 'finance' -> 'fice'),
    # which silently breaks n-gram matching on those words.
    # NOTE(review): assumes stray 'nan' text is separated from real words by
    # whitespace or punctuation -- confirm against Labeled.csv.
    text = re.sub(r'\bnan\b', '', text)
    return re.sub(r'[^a-zA-Z0-9\s]', '', text)


#standardizes messages -- lowercase, removes stray 'nan' tokens and symbols
users_df['Message'] = users_df['Message'].apply(_standardize_message)

In [4]:
# Preview the labeled users with their standardized messages
users_df

Unnamed: 0,Name,Message,Party_Affiliation
0,A Helmut Fickenwirth,gop god s own party,0
1,AJ Dupuis,ahhhh i think you idiots have it backwards jus...,0
2,Aaron Jackson,considering the democrats started the kkk and ...,0
3,Aaron Krukowski,live free vote blue veto sununu veto sununu,1
4,Aaron Robert,trump is your president 12 more years,0
5,Ace Knowles,great job governor sununu you have our support...,0
6,Adam Fontaine,thats because the rest of them want us to be t...,0
7,Adam Jache,pretty sure the other things tied to that bill...,0
8,Adrienne Fran,joe biden and kamala harris will bring this co...,1
9,Adrienne Spear,sununu leads from behind he waits as long as h...,1


In [5]:
#counts how many times users of the given party (0 = Republican, 1 = Democrat) use an n-gram
def freq(x, party_aff):
    """Count how often users of one party use a rule's n-gram.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row of the rules table; only ``x['Terms']`` (the n-gram text) is read.
    party_aff : int
        Party code to count for, compared against
        ``users_df['Party_Affiliation']`` (the notebook passes 0 for the
        'Republican Usage' column and 1 for the 'Democrat Usage' column).

    Returns
    -------
    int
        Total number of non-overlapping occurrences of the n-gram across all
        messages in the module-level ``users_df`` whose author's affiliation
        equals ``party_aff``.
    """
    rule = x['Terms']

    # Only messages written by the requested party can contribute to the total.
    same_party = users_df['Party_Affiliation'] == party_aff
    party_messages = users_df.loc[same_party, 'Message']

    # The n-gram is matched as literal text. Pre-filtering with
    # ``str.contains(rule)`` would interpret the n-gram as a regular
    # expression, so terms containing characters such as '+', '(' or '.'
    # would be miscounted or raise ``re.error``.
    # Non-string entries (e.g. NaN for a missing message) are skipped.
    return sum(
        message.count(rule)
        for message in party_messages
        if isinstance(message, str)
    )

In [6]:
# Party codes follow users_df['Party_Affiliation']: 0 is counted as Republican, 1 as Democrat.
rules_df['Republican Usage'] = rules_df.apply(lambda rule_row: freq(rule_row, 0), axis='columns')
rules_df['Democrat Usage'] = rules_df.apply(lambda rule_row: freq(rule_row, 1), axis='columns')

In [7]:
# Preview the rule table with the per-party usage counts
rules_df

Unnamed: 0,Terms,Exceptions,Republican Usage,Democrat Usage
0,the minimum wage,,12,10
1,a great job,,15,4
2,has done a,,18,4
3,go back to,,18,12
4,a lot of,,5,8
5,vote for him,,6,10
6,vote him out,,1,24
7,he has done,,8,4
8,needs to go,,4,7
9,one of the,,20,2


In [8]:
#determines total times n-gram was used in dataset
def tot_usage(x):
    """Return the combined usage of an n-gram across both parties.

    ``x`` is a row of the rules table; the result is the sum of its
    'Republican Usage' and 'Democrat Usage' values.
    """
    return x['Republican Usage'] + x['Democrat Usage']

In [9]:
# Row-wise sum of the two per-party usage columns
rules_df['Total Usage'] = rules_df.apply(tot_usage, axis='columns')

In [10]:
# Preview the rule table including the 'Total Usage' column
rules_df

Unnamed: 0,Terms,Exceptions,Republican Usage,Democrat Usage,Total Usage
0,the minimum wage,,12,10,22
1,a great job,,15,4,19
2,has done a,,18,4,22
3,go back to,,18,12,30
4,a lot of,,5,8,13
5,vote for him,,6,10,16
6,vote him out,,1,24,25
7,he has done,,8,4,12
8,needs to go,,4,7,11
9,one of the,,20,2,22


# N-Gram Analysis

In [11]:
#determines n-gram's affiliation
def aff(x):
    """Label an n-gram with the party that uses it more often.

    Returns 0 when 'Republican Usage' is larger, 1 when 'Democrat Usage' is
    larger, and None when neither count exceeds the other.
    """
    republican_count, democrat_count = x['Republican Usage'], x['Democrat Usage']

    if republican_count > democrat_count:
        return 0
    if democrat_count > republican_count:
        return 1
    # Neither side leads, so the n-gram gets no affiliation.
    return None
    

In [12]:
# Label each n-gram with the party that uses it more (no label on a tie)
rules_df['Affiliation'] = rules_df.apply(aff, axis='columns')

In [13]:
#calculates n-gram's accuracy as a percentage
def percentage(x):
    """Return the share of an n-gram's uses that come from its majority party.

    Parameters
    ----------
    x : mapping or pandas.Series
        A row of the rules table providing 'Republican Usage',
        'Democrat Usage' and 'Total Usage'.

    Returns
    -------
    float or None
        The larger party's usage as a percentage of 'Total Usage', rounded to
        two decimals; 50.0 when both parties use the n-gram equally often (and
        at least once); None when the n-gram is never used or the counts
        cannot be compared (e.g. NaN).
    """
    rep = x['Republican Usage']
    dem = x['Democrat Usage']
    tot = x['Total Usage']

    if rep > dem:
        share = (rep / tot) * 100
    elif dem > rep:
        share = (dem / tot) * 100
    # Logical ``and`` is required here: bitwise ``&`` binds tighter than the
    # comparisons, so ``rep == dem & rep != 0`` is evaluated as
    # ``rep == (dem & rep) != 0`` and raises TypeError for float counts.
    elif rep == dem and rep != 0:
        share = 50
    else:
        return None

    # Round to two decimals via string formatting.
    return float('{0:.2f}'.format(share))


In [14]:
# Share of each n-gram's uses that come from its majority party
rules_df['Accuracy %'] = rules_df.apply(percentage, axis='columns')

In [15]:
# Most frequently used n-grams first
rules_df = rules_df.sort_values(by='Total Usage', ascending=False)

In [16]:
# Preview the rules ordered by total usage, with affiliation and accuracy
rules_df

Unnamed: 0,Terms,Exceptions,Republican Usage,Democrat Usage,Total Usage,Affiliation,Accuracy %
3,go back to,,18,12,30,0.0,60.00
6,vote him out,,1,24,25,1.0,96.00
0,the minimum wage,,12,10,22,0.0,54.55
13,you want to,,13,9,22,0.0,59.09
2,has done a,,18,4,22,0.0,81.82
22,if you want,,12,10,22,0.0,54.55
9,one of the,,20,2,22,0.0,90.91
52,live free or die,,16,5,21,0.0,76.19
39,free or die,,16,5,21,0.0,76.19
29,we need to,,8,13,21,1.0,61.90


# Export N-Grams Analysis Data

In [17]:
# index=True is the pandas default, spelled out here because the row labels
# are written as the first, unnamed column of the CSV.
rules_df.to_csv('N-Gram Analysis.csv', index=True)