# Adding time-dependent features and keyword features to the existing model

Predicting whether two leaders will interact was done in 'Predictive_task_1.ipynb'.

In this notebook, we will continue using the features we used in 'Predictive_task_1.ipynb' and add time-dependent features to that model. These features are:

- The year and month of the most recent date on which each member interacted with another member of Parliament
- The number of times in each year that these leaders have interacted in the House.

After evaluating this model, we will also add one more feature:

- One-hot vector of the top k keywords

In [30]:
import pickle
import numpy as np
from random import shuffle
from bs4 import BeautifulSoup
import requests
import math
from sklearn import svm
from datetime import datetime
import operator
from collections import OrderedDict

In [2]:
# Load the pickled member names; below, only len(member_names) is used, as the
# number of members. The context manager closes the file even if unpickling
# raises.
# NOTE: pickle can execute arbitrary code while loading; only read trusted files.
with open('../Results/Pickles/member_names.pickle', 'rb') as f:
    member_names = pickle.load(f)

In [3]:
# Load the pickled interaction records. As used below, edge_details is keyed by
# (member index, member index) pairs, then by debate type, and each debate
# entry is iterated to obtain timestamps. The context manager closes the file
# even if unpickling raises.
# NOTE: pickle can execute arbitrary code while loading; only read trusted files.
with open('../Results/Pickles/edge_details.pickle', 'rb') as f:
    edge_details = pickle.load(f)

In [4]:
# Total number of recorded entries per member pair, summed over every debate
# type listed for that pair.
interactions = {
    pair: sum(len(edge_details[pair][kind]) for kind in edge_details[pair])
    for pair in edge_details
}

In [5]:
# Per-member interaction totals, keyed by member index. Every entry recorded
# for a pair (under any debate type) counts once for each of its two members.
individual_interaction = dict.fromkeys(range(len(member_names)), 0)

for pair in edge_details:
    first, second = pair[0], pair[1]
    for kind in edge_details[pair]:
        entry_count = len(edge_details[pair][kind])
        individual_interaction[first] += entry_count
        individual_interaction[second] += entry_count

In [6]:
# Baseline used for members that have no recorded entry at all.
timestamp = datetime(year = 2009, month = 1, day = 1)
n = len(member_names)

# most_recent_timestamp[m] ends up as the latest timestamp seen for member m
# across all of its pairs and debate types (never earlier than the baseline).
most_recent_timestamp = [timestamp] * n

for pair in edge_details:
    first, second = pair[0], pair[1]
    for kind in edge_details[pair]:
        for ts in edge_details[pair][kind]:
            if ts > most_recent_timestamp[first]:
                most_recent_timestamp[first] = ts
            if ts > most_recent_timestamp[second]:
                most_recent_timestamp[second] = ts

In [7]:
# Every distinct timestamp that appears anywhere in edge_details.
session_dates = {
    ts
    for pair in edge_details
    for kind in edge_details[pair]
    for ts in edge_details[pair][kind]
}

In [8]:
# Sorted list of every debate type recorded for at least one member pair.
types_of_debates = sorted({
    debate
    for rec in edge_details
    for debate in edge_details[rec]
})

# Column of each debate type, built once so the loop below does a dict lookup
# instead of two linear `list.index` scans per (pair, debate).
debate_index = {debate: column for column, debate in enumerate(types_of_debates)}

# debate_participation[m][c] is 1 when member m belongs to at least one pair
# recorded under the debate type in column c, and 0 otherwise.
debate_participation = [
    [0] * len(types_of_debates) for _ in range(len(member_names))
]

for rec in edge_details:
    for debate in edge_details[rec]:
        column = debate_index[debate]
        debate_participation[rec[0]][column] = 1
        debate_participation[rec[1]][column] = 1

# Build one example per (member pair, debate type). Feature layout, in order:
#   debate_participation[i] + debate_participation[j]
#   one-hot of the debate type the example is about
#   individual_interaction[i], individual_interaction[j]
#   the pair's entry counts for 2009..2014
#   year and month of each member's most recent timestamp
#   a constant 1 (serves as the intercept column for the least-squares fit)
# An example is positive when the pair has an entry for that debate type and
# negative otherwise. Rows are appended in the same order as before.
positive_data = []
negative_data = []

interaction_years = range(2009, 2015)
num_debate_types = len(types_of_debates)

for i in range(n):
    for j in range(i + 1, n):
        # Pairs that never interacted behave like a pair with no debates.
        pair_edges = edge_details[(i, j)] if (i, j) in edge_details else {}

        # Yearly counts depend only on the pair, so compute them once per pair
        # instead of once per debate type.
        year_count = dict.fromkeys(interaction_years, 0)
        for d in pair_edges:
            for ts in pair_edges[d]:
                # A year outside 2009-2014 raises KeyError, as it did before.
                year_count[ts.year] += 1
        yearly_interactions = [year_count[year] for year in interaction_years]

        # Parts of the vector shared by every debate type of this pair.
        leading = debate_participation[i] + debate_participation[j]
        trailing = (
            [individual_interaction[i], individual_interaction[j]]
            + yearly_interactions
            + [most_recent_timestamp[i].year, most_recent_timestamp[i].month,
               most_recent_timestamp[j].year, most_recent_timestamp[j].month, 1]
        )

        # types_of_debates is built from a set, so each entry's position is
        # the same value `types_of_debates.index(debate)` would return.
        for position, debate in enumerate(types_of_debates):
            one_hot = [0] * num_debate_types
            one_hot[position] = 1
            feature_vector = leading + one_hot + trailing
            if debate in pair_edges:
                positive_data.append(feature_vector)
            else:
                negative_data.append(feature_vector)

In [9]:
# Random visiting orders for the positive (indices1) and negative (indices2)
# examples. NOTE(review): no random seed is set in the cells shown, so the
# resulting split differs from run to run.
indices1 = list(range(len(positive_data)))
indices2 = list(range(len(negative_data)))

for ordering in (indices1, indices2):
    shuffle(ordering)

In [10]:
# Number of negative examples (indices2 has one entry per row of negative_data).
len(indices2)

4212701

In [11]:
# Number of positive examples (indices1 has one entry per row of positive_data).
len(indices1)

137029

In [12]:
# Balanced training set: the first 4/5 of the shuffled positives, followed by
# the same number of shuffled negatives (the negatives are subsampled to the
# positive count rather than used in full).
train_size = int(4 * len(indices1) / 5)

train_data = (
    [positive_data[indices1[pos]] for pos in range(train_size)]
    + [negative_data[indices2[pos]] for pos in range(train_size)]
)
train_labels = [1] * train_size + [0] * train_size

In [13]:
# Least-squares fit of the 0/1 labels on the feature vectors. theta4 holds the
# fitted weights; residuals4, rank4 and s4 are not read in the cells shown here.
theta4, residuals4, rank4, s4 = np.linalg.lstsq(train_data, train_labels)

In [14]:
# Balanced test set: the remaining 1/5 of the shuffled positives, followed by
# the negatives at the same shuffled positions (disjoint from the training
# slice, which stops where this range starts).
split_at = int(4 * len(indices1) / 5)
held_out = range(split_at, len(indices1))

test_data = (
    [positive_data[indices1[pos]] for pos in held_out]
    + [negative_data[indices2[pos]] for pos in held_out]
)
test_labels = [1] * len(held_out) + [0] * len(held_out)

In [15]:
# Score each test row with the least-squares weights, threshold the score at
# 0.5 to get a 0/1 prediction, and tally how many predictions match the label.
count = 0
total = 0

for sample, expected in zip(test_data, test_labels):
    score = 0.0
    for position, weight in enumerate(theta4):
        score += weight * sample[position]
    predicted = 1 if score > 0.5 else 0
    if predicted == expected:
        count += 1
    total += 1

In [16]:
# Fraction of test rows the thresholded least-squares model classified correctly.
count * 1.0 / len(test_data)

0.8855360140115304

In [22]:
# Linear SVM trained on the same balanced training set and applied to the same
# test rows; `fit` returns the estimator itself, so it can be chained.
clf4 = svm.LinearSVC(C = 100).fit(train_data, train_labels)
pred = clf4.predict(test_data)

In [23]:
# Number of SVM predictions that agree with the test labels, then the accuracy.
count = sum(
    1 for predicted, actual in zip(pred, test_labels) if predicted == actual
)

print(count / len(test_labels))

0.8716339487703423


## Finding the most popular keywords

In [38]:
# Load the pickled keyword collection (presumably keyword -> frequency, going
# by the file name — confirm). The context manager closes the file even if
# unpickling raises.
# NOTE: pickle can execute arbitrary code while loading; only read trusted files.
with open('../Data/keyword_frequencies.pickle', 'rb') as f:
    keywords = pickle.load(f)

k = 20  # Number of keywords

# Keep the first k keys in iteration order (fewer if there are not k of them).
# NOTE(review): these are only the k most frequent keywords if `keywords`
# iterates in descending-frequency order — confirm how the pickle was written.
topk = [key for _, key in zip(range(k), keywords)]

## Adding a one-hot vector for keywords

In [40]:
# Build one example per (member pair, debate type). Feature layout, in order:
#   debate_participation[i] + debate_participation[j]
#   one-hot of the debate type the example is about
#   individual_interaction[i], individual_interaction[j]
#   the pair's entry counts for 2009..2014
#   k keyword indicator bits (one per entry of topk)
#   year and month of each member's most recent timestamp
#   a constant 1 (serves as the intercept column for the least-squares fit)
# An example is positive when the pair has an entry for that debate type and
# negative otherwise. Rows are appended in the same order as before.
positive_data = []
negative_data = []

interaction_years = range(2009, 2015)
num_debate_types = len(types_of_debates)

for i in range(n):
    for j in range(i + 1, n):
        # NOTE(review): this vector is created once per pair and never reset
        # between debate types, so keyword bits set while handling one debate
        # type carry over into every later example of the same pair (positive
        # or negative). The bits therefore depend on the sorted order of
        # types_of_debates. Behavior is kept as-is; confirm whether a
        # per-debate or per-pair keyword vector was intended.
        keyword_onehot = [0] * k

        # Pairs that never interacted behave like a pair with no debates.
        pair_edges = edge_details[(i, j)] if (i, j) in edge_details else {}

        # Yearly counts depend only on the pair, so compute them once per pair
        # instead of once per debate type.
        year_count = dict.fromkeys(interaction_years, 0)
        for d in pair_edges:
            for ts in pair_edges[d]:
                # A year outside 2009-2014 raises KeyError, as it did before.
                year_count[ts.year] += 1
        yearly_interactions = [year_count[year] for year in interaction_years]

        # Parts of the vector shared by every debate type of this pair.
        leading = debate_participation[i] + debate_participation[j]
        counts = (
            [individual_interaction[i], individual_interaction[j]]
            + yearly_interactions
        )
        recency = [most_recent_timestamp[i].year, most_recent_timestamp[i].month,
                   most_recent_timestamp[j].year, most_recent_timestamp[j].month, 1]

        # types_of_debates is built from a set, so each entry's position is
        # the same value `types_of_debates.index(debate)` would return.
        for position, debate in enumerate(types_of_debates):
            one_hot = [0] * num_debate_types
            one_hot[position] = 1

            is_positive = debate in pair_edges
            if is_positive:
                # Mark every top-k keyword attached to any timestamp of this
                # debate before the vector is assembled.
                for ts in pair_edges[debate]:
                    for kw in pair_edges[debate][ts]:
                        if kw in topk:
                            keyword_onehot[topk.index(kw)] = 1

            # Concatenation copies the current keyword bits into the new row.
            feature_vector = leading + one_hot + counts + keyword_onehot + recency
            if is_positive:
                positive_data.append(feature_vector)
            else:
                negative_data.append(feature_vector)

In [41]:
# Fresh random visiting orders for the rebuilt positive (indices1) and negative
# (indices2) examples. NOTE(review): no random seed is set in the cells shown,
# so this split differs from run to run and from the earlier model's split.
indices1 = list(range(len(positive_data)))
indices2 = list(range(len(negative_data)))

for ordering in (indices1, indices2):
    shuffle(ordering)

In [42]:
# Balanced training set for the keyword model: the first 4/5 of the shuffled
# positives, followed by the same number of shuffled negatives.
train_size = int(4 * len(indices1) / 5)

train_data = (
    [positive_data[indices1[pos]] for pos in range(train_size)]
    + [negative_data[indices2[pos]] for pos in range(train_size)]
)
train_labels = [1] * train_size + [0] * train_size

In [43]:
# Least-squares fit of the 0/1 labels on the keyword-augmented feature vectors.
# theta5 holds the fitted weights; residuals5, rank5 and s5 are not read in the
# cells shown here.
theta5, residuals5, rank5, s5 = np.linalg.lstsq(train_data, train_labels)

In [44]:
# Balanced test set for the keyword model: the remaining 1/5 of the shuffled
# positives, followed by the negatives at the same shuffled positions.
split_at = int(4 * len(indices1) / 5)
held_out = range(split_at, len(indices1))

test_data = (
    [positive_data[indices1[pos]] for pos in held_out]
    + [negative_data[indices2[pos]] for pos in held_out]
)
test_labels = [1] * len(held_out) + [0] * len(held_out)

In [45]:
# Score each test row with the keyword model's least-squares weights, threshold
# the score at 0.5, and tally how many predictions match the label.
count = 0
total = 0

for sample, expected in zip(test_data, test_labels):
    score = 0.0
    for position, weight in enumerate(theta5):
        score += weight * sample[position]
    predicted = 1 if score > 0.5 else 0
    if predicted == expected:
        count += 1
    total += 1

In [46]:
# Fraction of test rows the thresholded keyword least-squares model classified correctly.
count * 1.0 / len(test_data)

0.8975406845216376

In [53]:
# Linear SVM trained on the keyword-augmented training set and applied to the
# matching test rows; `fit` returns the estimator itself, so it can be chained.
clf5 = svm.LinearSVC(C = 50).fit(train_data, train_labels)
pred = clf5.predict(test_data)

In [54]:
# Number of SVM predictions that agree with the test labels, then the accuracy.
count = sum(
    1 for predicted, actual in zip(pred, test_labels) if predicted == actual
)

print(count / len(test_labels))

0.8704663212435233
