In [2]:
import torch
import pandas as pd
import torch.nn as nn
import torchvision

In [18]:
df = pd.read_csv('ExtractedTweets.csv', encoding = 'utf-8')

In [19]:
print(df.head(9))

      Party         Handle                                              Tweet
0  Democrat  RepDarrenSoto  Today, Senate Dems vote to #SaveTheInternet. P...
1  Democrat  RepDarrenSoto  RT @WinterHavenSun: Winter Haven resident / Al...
2  Democrat  RepDarrenSoto  RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3  Democrat  RepDarrenSoto  RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4  Democrat  RepDarrenSoto  RT @Vegalteno: Hurricane season starts on June...
5  Democrat  RepDarrenSoto  RT @EmgageActionFL: Thank you to all who came ...
6  Democrat  RepDarrenSoto  Hurricane Maria left approx $90 billion in dam...
7  Democrat  RepDarrenSoto  RT @Tharryry: I am delighted that @RepDarrenSo...
8  Democrat  RepDarrenSoto  RT @HispanicCaucus: Trump's anti-immigrant pol...


In [20]:
df.iloc[0][2]

'Today, Senate Dems vote to #SaveTheInternet. Proud to support similar #NetNeutrality legislation here in the House… https://t.co/n3tggDLU1L'

In [21]:
df1 = df.drop(['Handle'], axis=1)

In [22]:
df1

Unnamed: 0,Party,Tweet
0,Democrat,"Today, Senate Dems vote to #SaveTheInternet. P..."
1,Democrat,RT @WinterHavenSun: Winter Haven resident / Al...
2,Democrat,RT @NBCLatino: .@RepDarrenSoto noted that Hurr...
3,Democrat,RT @NALCABPolicy: Meeting with @RepDarrenSoto ...
4,Democrat,RT @Vegalteno: Hurricane season starts on June...
5,Democrat,RT @EmgageActionFL: Thank you to all who came ...
6,Democrat,Hurricane Maria left approx $90 billion in dam...
7,Democrat,RT @Tharryry: I am delighted that @RepDarrenSo...
8,Democrat,RT @HispanicCaucus: Trump's anti-immigrant pol...
9,Democrat,RT @RepStephMurphy: Great joining @WeAreUnidos...


In [23]:
import unicodedata
import string

print(string.ascii_letters)

abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ


In [24]:
import unicodedata
import string

all_letters = string.ascii_letters + " #@!?.,;'"
n_letters = len(all_letters)

# Turn a Unicode string to plain ASCII, thanks to https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )


In [38]:
tweets = []
for index, row in df1.iterrows():
    tweets.append((str(row[0]) ,str(row[1])))

In [42]:
tweets[60000]

('Republican',
 'Rep. Sensenbrenner finished his 115 in-person #townhall meeting schedule for this year. Photos available at https://t.co/pVCk3a6ICC #WI05')

In [45]:
for twt in tweets:
    twt = (twt[0],unicodeToAscii(twt[1]))

In [46]:
tweets[23909]

('Democrat',
 'Too many local people are struggling to provide housing for their families in Hawaii. Our state is one of the most… https://t.co/qXWmkqvtpq')

In [56]:
# letters encoded as index, and tensors etc. for training purposes
def letterToIndex(a):
    return all_letters.find(a)
    

def letterToTensor(letter):
    i = letterToIndex(letter)
    tensor = torch.zeros(1, n_letters)
    tensor[0][i] = 1
    return tensor

def lineToTensor(line):
    l = len(line)
    tensor = torch.zeros(l,1,n_letters)
    for i in range(l):
        tensor[i][0][letterToIndex(line[i])] = 1
    return tensor

print(letterToTensor(';'))
print(lineToTensor('hiding bla bla bla #this @you'))

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
         0., 0., 0., 0., 0., 1., 0.]])
tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])


In [57]:
n_categories = 2

In [86]:
import torch.autograd as autograd
import torch.nn.functional as F

class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(RNN, self).__init__()
        self.hidden_size = hidden_size
        
        self.i2h = nn.LSTM(input_size, hidden_size)
        self.h2o = nn.Linear(hidden_size, output_size)
        self.softmax = nn.LogSoftmax(dim=1)
        self.hidden = self.init_hidden()
        
    def init_hidden(self):
        # first is hidden h and second is hidden cell c
        return (autograd.Variable(torch.zeros(1,1,self.hidden_size)),
               autograd.Variable(torch.zeros(1,1,self.hidden_size)))
        
    def forward(self, input):
        lstm_out, self.hidden = self.i2h(input, self.hidden)
        out = self.h2o(lstm_out.view(len(input), -1))
        out = self.softmax(out)
        return out, hidden
        
n_hidden = 64
rnn = RNN(n_letters, n_hidden, n_categories)

In [129]:
input = lineToTensor('Today')
output = rnn(input)

In [130]:
print(output[0][-1])

tensor([-0.7945, -0.6011], grad_fn=<SelectBackward>)


In [131]:
import random

def randomChoice(l):
    return l[random.randint(0, len(l) - 1)]
    # return r, l[r]

def randomTwt():
    twt = randomChoice(tweets)
    category_tensor = torch.tensor([0], dtype=torch.long)
    if twt[0] == 'Democrat':
        category_tensor = torch.tensor([1], dtype=torch.long)
    line_tensor = lineToTensor(twt[1])
    return twt[0], twt[1], category_tensor, line_tensor
    

In [132]:
for i in range(5):
    cat, twt, c_tensor, t_tensor = randomTwt()
    print('cat=',cat, '; tweet=', twt, '; category tensor=', c_tensor, '; tweet tensor=', t_tensor)

cat= Democrat ; tweet= RT @FDNY: In response to Friday’s fatal 2-alarm fire, #FDNY Fire Safety Education Unit is sharing safety info at Rite Aid at 4232 Bay Chest… ; category tensor= tensor([1]) ; tweet tensor= tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 1.]]])
cat= Republican ; tweet= Chairman @DrPhilRoe and Rep. @VernBuchanan released a statement after @DeptVetAffairs announced they will issue vet… https://t.co/huUMgFCjPY ; category tensor= tensor([0]) ; tweet tensor= tensor([[[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[1., 0., 0.,  ..., 0., 0., 0.]],

        ...,

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]],

        [[0., 0., 0.,  ..., 0., 0., 0.]]])
cat= Democrat ; tweet= This is e

In [133]:
criterion = nn.NLLLoss()

learning_rate = 0.009 # If you set this too high, it might explode. If too low, it might not learn

def train(category_tensor, line_tensor):
    rnn.hidden = rnn.init_hidden()

    rnn.zero_grad()

    out, hidden = rnn(line_tensor)
    output = out[-1].unsqueeze(0)
    loss = criterion(output, category_tensor)
    loss.backward()

    # Add parameters' gradients to their values, multiplied by learning rate
    for p in rnn.parameters():
        p.data.add_(-learning_rate, p.grad.data)

    return output, loss.item()

In [134]:
def toCategory(i):
    party = 'Republican'
    if i==1:
        party = 'Democrat'
    return party

def toIndex(s):
    index = 0
    if s == 'Democrat':
        index = 1
    return index

def partyFromOutput(output):
    top_n, top_i = output.topk(1)
    category_i = top_i[0].item()
    return toCategory(category_i), category_i

print(partyFromOutput(output[0][-1]))

('Democrat', 1)


In [135]:
import time
import math

n_iters = 100000
print_every = 10000
#plot_every = 1000

current_loss = 0
all_losses = []

def timeSince(since):
    now = time.time()
    s = now - since
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)

start = time.time()

for iter in range(n_iters):
    cat, twt, c_tensor, t_tensor = randomTwt()
    # category, line, category_tensor, line_tensor = randomTrainingExample()
    output, loss = train(c_tensor, t_tensor)
    current_loss += loss

    # Print iter number, loss, name and guess
    if iter % print_every == 0:
        guess, guess_i = partyFromOutput(output)
        correct = '✓' if guess == cat else '✗ (%s)' % cat
        print('%d %d%% (%s) %.4f %s / %s %s' % (iter, iter / n_iters * 100, timeSince(start), loss, twt, guess, correct))
        print(current_loss/print_every)
        current_loss = 0

    #Add current loss avg to list of losses
    #if iter % plot_every == 0:
    #    all_losses.append(current_loss / plot_every)
    #    current_loss = 0

0 0% (0m 0s) 0.6158 We closed out #WomensHistoryMonth with a special event where we announced the winners of our “Women Who Inspire Me”… https://t.co/XKdVZXV6ev / Democrat ✓
6.157895922660827e-05
10000 10% (22m 4s) 0.8056 RT @VCClerkRecorder: Thomas Fire Victims May Need to Re-Register to Vote https://t.co/ur8MJV7AnH https://t.co/ZehqTyFQ6C / Republican ✗ (Democrat)
0.6938330920338631
20000 20% (77m 3s) 0.6942 Happy Mother's Day to all the mothers, grandmothers, aunts and mother-figures. And a special shoutout to my lovely,… https://t.co/8eXfXKMQIv / Republican ✗ (Democrat)
0.6940314987897873


KeyboardInterrupt: 