# Introduction to Data Science, Lab 1 (9/16)
- Python fundamentals recap (with NumPy, Pandas)
- Modelling with Google Trends data

## Python Essentials
#### *Formatting strings*

In [1]:
# Standard: a plain string literal; the '\n' inside it starts a new line in the output
print('Hello world from room 650!\nBye.')

Hello world from room 650!
Bye.


In [2]:
# String with argument(s):
room=650 # value substituted into both messages below
print('Hello world from room {}!'.format(room)) # str.format puts its argument in place of '{}'
print(f'Hello world from room {room}!') # f-string: the expression inside {} is evaluated in place

Hello world from room 650!
Hello world from room 650!


In [3]:
# Multiple arguments:
import math

one_third=1/3
print('1/3 equals {}, pi equals {}'.format(one_third,math.pi)) # empty '{}' are filled left to right

1/3 equals 0.3333333333333333, pi equals 3.141592653589793


In [4]:
# Format the argument:
one_third=1/3
print('1/3 equals {}'.format(one_third)) # insert 'one_third' in place of '{}'
print('1/3 equals {:.3f}'.format(one_third)) # format spec '.3f': fixed-point with 3 decimals
print('1/3 equals {}'.format(round(one_third,3))) # apply the 'round' function directly
print(f'1/3 equals {one_third:.3f}') # the same format spec works inside an f-string

1/3 equals 0.3333333333333333
1/3 equals 0.333
1/3 equals 0.333
1/3 equals 0.333


In [5]:
# Choose the order of arguments: {0} is the first argument of format(), {1} the second.
# Note: the text labels are not swapped along with the indices, so the printed
# sentence pairs each label with the other value.
print('1/3 equals {1}, pi equals {0}'.format(one_third,math.pi))

1/3 equals 3.141592653589793, pi equals 0.3333333333333333


In [6]:
# Keyword arguments: placeholders are matched to format() arguments by name, not by position
print('1/3 equals {first}, pi equals {second}'.format(first=one_third,second=math.pi))

1/3 equals 0.3333333333333333, pi equals 3.141592653589793


In [7]:
# Decode a list of character codes and glue the characters into one string:
secret=[72,101,108,108,111,32,119,111,114,108,100,33] # ascii values of secret characters
print('-'.join(map(chr,secret))) # the string that .join() is called on ('-') is the separator

H-e-l-l-o- -w-o-r-l-d-!


In [8]:
# Replace certain characters: str.replace returns a new string with every 'o' turned into '0'
'Hello world from room 650!'.replace('o','0')

'Hell0 w0rld fr0m r00m 650!'

#### *Timing*
*Two popular choices are* ***time*** *and* ***timeit***:

In [9]:
import time
print(time.time()) # time since January 1, 1970, 00:00:00 (UTC) in seconds, aka UNIX time
print(time.time()/60/60/24/365) # approx. number of years passed (every year counted as 365 days)

1600255523.843349
50.74376978195015


In [10]:
start=time.time() # timestamp right before the work
time.sleep(1) # hang for 1 second
end=time.time() # timestamp right after the work
print(end-start) # elapsed seconds; slightly above 1 because of call overhead

1.0059258937835693


In [11]:
%%timeit -n 2 # Jupyter notebook usage; include %% to time all the below contents of the current cell
time.sleep(0.1)
time.sleep(0.1)

207 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 2 loops each)


In [12]:
%timeit -n 2 time.sleep(0.1) # include % to time only this line
time.sleep(0.1)
time.sleep(0.1)

103 ms ± 823 µs per loop (mean ± std. dev. of 7 runs, 2 loops each)


#### *Custom functions*

In [13]:
# classic:
def sec2year(sec):
    """Convert seconds into a whole number of 365-day years.

    The count is floored with '//', but it is still returned as a float
    (e.g. 50.0) because the preceding divisions produce floats.
    """
    days=sec/60/60/24 # seconds -> minutes -> hours -> days
    return days//365 # '//' keeps only the whole years

# Simple one-expression functions can also be written with 'lambda' (an anonymous function);
# here it is bound to a name and, unlike sec2year, uses true division all the way through.
sec2year_smart=lambda x: x/60/60/24/365

In [14]:
print(sec2year(time.time())) # should be a whole number
print(sec2year_smart(time.time())) # should keep the fractional part

50.0
50.74376995947593


In [15]:
# Let's figure out which is faster:
%timeit -n 1000 -r 10 sec2year(time.time())
%timeit -n 1000 -r 10 sec2year_smart(time.time())

479 ns ± 149 ns per loop (mean ± std. dev. of 10 runs, 1000 loops each)
454 ns ± 137 ns per loop (mean ± std. dev. of 10 runs, 1000 loops each)


In [16]:
# Some arguments can have default values (should come after any non-default argument)
def divide(num,denom,whole=False):
    """Return num/denom, or the floor division num//denom when 'whole' is truthy."""
    if whole:
        return num//denom
    return num/denom

In [17]:
# We may choose to pass them explicitly or leave to default
print(divide(10,3,True)) # whole=True: floor division
print(divide(10,3,False)) # whole=False passed explicitly: true division
print(divide(10,3)) # 'whole' omitted, so its default (False) applies

3
3.3333333333333335
3.3333333333333335


In [18]:
# Additional arguments can be passed by adding *args or **kwargs

def sum_and_shout(num,denom,message):
    """Print 'message' and return num+denom."""
    result=num+denom
    print(message)
    return result

def divide_and_round(num,denom,decimals):
    """Return num/denom rounded to 'decimals' decimal places."""
    result=num/denom
    return round(result,decimals)

def multiply_and_sleep(num,denom,sec):
    """Sleep for 'sec' seconds, then return num*denom."""
    result=num*denom
    time.sleep(sec)
    return result

def router(num,denom,operation,**kwargs):
    """Run the helper selected by 'operation' on (num,denom).

    Each operation reads exactly one extra keyword argument:
      'sum'    -> sum_and_shout,      needs message=...
      'divide' -> divide_and_round,   needs decimals=...
      'sleep'  -> multiply_and_sleep, needs sec=...

    Returns whatever the selected helper returns.

    Raises:
        KeyError: the keyword required by the chosen operation was not passed.
        ValueError: 'operation' is not one of the names above (previously this
            case fell through and silently returned None).
    """
    if operation=='sum':
        return sum_and_shout(num,denom,kwargs['message'])
    elif operation=='divide':
        return divide_and_round(num,denom,kwargs['decimals'])
    elif operation=='sleep':
        return multiply_and_sleep(num,denom,kwargs['sec'])
    raise ValueError(f"unknown operation {operation!r}; expected 'sum', 'divide' or 'sleep'")

In [19]:
router(5,2,'sum',message='WoooHooo!')

# Compare to:
# def router(num,denom,operation,message,decimals,sec)

# To execute sum via 'router', we would have needed:
# >>> dummy_decimals=3
# >>> dummy_sec=0.5
# >>> router(5,2,'sum','WoooHooo!',dummy_decimals,dummy_sec)

# Why don't we just call sum_and_shout(5,2,message='WoooHooo!')?
# The 'operation' flag could be given as a (command line) argument,
# so you need to "route" based on 'operation' to execute an appropriate function.

WoooHooo!


7

#### *Data structures*
- *(1) Lists:*

In [20]:
# Definition: four ways to build the same list of odd numbers
a=[1,3,5,7,9] # explicit
b=[i for i in range(1,10,2)] # range(start,stop,step); 'stop' itself is excluded
c=[i for i in range(10)][1:10:2] # slicing [start:stop:step]
d=[i for i in range(10)][slice(1,10,2)] # the same slice as an explicit object: slice(start,stop,step)
print('{}\n{}\n{}\n{}'.format(a,b,c,d))

[1, 3, 5, 7, 9]
[1, 3, 5, 7, 9]
[1, 3, 5, 7, 9]
[1, 3, 5, 7, 9]


In [21]:
# Append: O(1) (amortized)
a=[]
a.append(1)
a.append(True)
a.append('WoooHooo!')
print(a) # lists can hold variables of different types

[1, True, 'WoooHooo!']


In [22]:
# Append: O(1) -- time one append at every 10000th iteration while the list keeps growing.
# 'a' is not created here: the loop keeps appending to the list left by the previous cell.
for n in range(100000):
    if not n%10000: # n is a multiple of 10000: measure this single append
        start=time.time()
        a.append(n)
        print(f'length: {len(a)}'+f', time: {round(time.time()-start,6)}')
    else:
        a.append(1) # untimed filler append

length: 4, time: 3e-06
length: 10004, time: 6e-06
length: 20004, time: 6e-06
length: 30004, time: 7e-06
length: 40004, time: 6e-06
length: 50004, time: 9e-06
length: 60004, time: 5e-06
length: 70004, time: 7e-06
length: 80004, time: 5e-06
length: 90004, time: 4e-06


In [23]:
# Remove: O(n) -- time one removal at every 10000th iteration while the list keeps shrinking.
# Values are removed in increasing order, so 'n' is always the current first element.
a=list(range(100001))
for n in range(100001):
    if not n%10000: # n is a multiple of 10000: measure this single removal
        start=time.time()
        a.remove(n)
        print(f'length: {len(a)}'+f', time: {round(time.time()-start,6)}')
    else:
        a.remove(n) # untimed removal

length: 100000, time: 9.3e-05
length: 90000, time: 5.4e-05
length: 80000, time: 3.9e-05
length: 70000, time: 0.000145
length: 60000, time: 0.000124
length: 50000, time: 2.6e-05
length: 40000, time: 2e-05
length: 30000, time: 1.6e-05
length: 20000, time: 1.2e-05
length: 10000, time: 1e-05
length: 0, time: 3e-06


In [24]:
# Other commonly used methods:

a=[('one',1),('two',2),('five',5),('four',4),('three',3)]
print(a.pop()) # removes and returns the last element, O(1)

('three', 3)


In [25]:
a=[('one',1),('two',2),('five',5),('four',4),('three',3)]
print(sorted(a)) # returns a new sorted list, O(nlogn); tuples compare element by element, so the names decide the order here

[('five', 5), ('four', 4), ('one', 1), ('three', 3), ('two', 2)]


In [26]:
a=[('one',1),('two',2),('five',5),('four',4),('three',3)]
print(sorted(a,key=lambda x: x[1])) # sorts the list according to the second element, O(nlogn)

[('one', 1), ('two', 2), ('three', 3), ('four', 4), ('five', 5)]


In [27]:
a=[('one',1),('two',2),('five',5),('four',4),('three',3)]
print('WoooHooo' in a) # checks if an object is in the list, O(n)

False


In [28]:
# Exercise: Given a list 's' of points on a plane and a positive integer k, write
# a function KClosest(s,k) that returns k of those points closest to the origin.
# Can you do it in one line (not counting the function header)?

# For testing:
s=[[1,1],[0,2],[0,1],[5,1],[4,0]]
k=2

- (2) *Dictionaries:*

In [29]:
# Definition
a={'a':1,'b':True,'c':-10}
print(a)

{'a': 1, 'b': True, 'c': -10}


In [30]:
# Insertion, O(1) (on average)
a['d']=5
print(a)

{'a': 1, 'b': True, 'c': -10, 'd': 5}


In [31]:
# Deletion, O(1) (on average)
del a['b']
print(a)

{'a': 1, 'c': -10, 'd': 5}


In [32]:
# Check, O(1) (on average)
print('z' in a) # or
print('z' in a.keys())

False
False


In [33]:
# Access, O(1) (on average)
print(a['d'])

5


In [34]:
# Sort by values, O(nlogn): sorted() returns a list of (key,value) pairs; the dict itself is not changed
print(sorted(a.items(),key=lambda x:x[1]))

[('c', -10), ('a', 1), ('d', 5)]


In [35]:
# Why are dictionaries useful?
# (1) Reduced complexity of most operations compared to having two lists;
# (2) Convenience/Interpretability of the code

# Exercise 1 (Leetcode 136):
# Write a function 'single(a)' that takes an integer list/array 'a', in which
# all elements appear exactly twice except for one element that appears once.
# Output this one integer in O(n) complexity (n is the list length).
# Example: single([0,4,3,3,0]) outputs 4.
# For testing:
a=[7,2,9,1,2,9,1,10,10]

# Exercise 2 (Leetcode 451):
# Write a function 'frequency(s)' that sorts characters of a string 's' in decreasing order
# based on the frequency of characters.
# Example: frequency("data") outputs "aadt" (or "aatd")
s="atlantic"

# Another good example is Leetcode 997.

#### *Other useful built-in functions*

In [36]:
# s.split(char): breaks a string s into a list of strings separated by char
s='Hello world from room 650!'
print(s.split(' '))

['Hello', 'world', 'from', 'room', '650!']


In [37]:
# s.join(list of strings): the opposite of split; joins all strings in the list and places s in between each pair
strs=['Hello', 'world', 'from', 'room', '650!']
print(' '.join(strs))

Hello world from room 650!


In [38]:
# sum(iterable): sums elements of a valid iterable (e.g. list, tuple, numpy array, etc.)
print(sum([1,2,3,4,5]))

15


In [39]:
# enumerate(iterable): outputs an <enumerate> object that yields (index,element) pairs of an iterable
print([(i,v) for i,v in enumerate(['zero','one','two','three','four'])])

[(0, 'zero'), (1, 'one'), (2, 'two'), (3, 'three'), (4, 'four')]


In [40]:
# zip(iterable,iterable): outputs a <zip> with pairs of elements from the iterables
# (equal lengths are not required: zip silently stops at the end of the shortest iterable)
vec_1=[1,2,3]
vec_2=[-1,0,1]
print(sum([a*b for a,b in zip(vec_1,vec_2)])) # dot product of vec_1 and vec_2

2


In [41]:
# Others like type(), len(), min(), max(), etc.

#### NumPy Module
NumPy is a python library for efficient and convenient work with arrays (ndarrays).
Unlike python lists, which in principle serve the same purpose, numpy arrays are stored in contiguous memory space, making referencing/accessing faster.

In [44]:
# Numpy arrays from list:
import numpy as np

a=np.array([[1,1,1],[2,2,2]])
print(f'array:\n {a}\n')
print(f'shape: {a.shape}')

array:
 [[1 1 1]
 [2 2 2]]

shape: (2, 3)


In [45]:
# NumPy arrays from methods:
b=np.arange(10) # the same as range() but returns a ndarray
c=np.ones((2,3)) # make an array of all ones with a given shape
d=np.zeros((2,3)) # make an array of all zeros with a given shape
e=np.full((2,3),7) # make an array of all sevens with a given shape
print(f'{b}\n\n{c}\n\n{d}\n\n{e}')

[0 1 2 3 4 5 6 7 8 9]

[[1. 1. 1.]
 [1. 1. 1.]]

[[0. 0. 0.]
 [0. 0. 0.]]

[[7 7 7]
 [7 7 7]]


In [46]:
# NumPy arrays from other arrays:
print(np.hstack([c,d]))
print('\n')
print(np.vstack([c,d]))

[[1. 1. 1. 0. 0. 0.]
 [1. 1. 1. 0. 0. 0.]]


[[1. 1. 1.]
 [1. 1. 1.]
 [0. 0. 0.]
 [0. 0. 0.]]


In [47]:
# Reshaping arrays
print(a.reshape((3,2))) # traverses entries in order; not a transpose operation
print('\n')
print(np.transpose(a)) # this is a transpose operation
print('\n')
print(a.reshape(-1)) # flattening out

[[1 1]
 [1 2]
 [2 2]]


[[1 2]
 [1 2]
 [1 2]]


[1 1 1 2 2 2]


In [48]:
import numpy as np

# Arithmetic works better:
print([1,2,3]*3) # concatenates [1,2,3] with itself 3 times
print(np.array([1,2,3])*3) # elementwise product

[1, 2, 3, 1, 2, 3, 1, 2, 3]
[3 6 9]


In [49]:
# Elementwise addition:
print(np.array([1,2,3])+3) # or
print(np.array([1,2,3])+np.array([3,3,3]))

[4 5 6]
[4 5 6]


In [50]:
# Not supported for lists: '+' between a list and an int raises a TypeError.
# The error is caught and printed, so the message is still shown but
# "Restart & Run All" can continue past this cell.
try:
    print([1,2,3]+3)
except TypeError as err:
    print(f'TypeError: {err}')

TypeError: can only concatenate list (not "int") to list

In [51]:
# Saving/Loading arrays:
a=np.array([1,2,3,4,5])
np.save('best_array_ever.npy',a)
a=np.load('best_array_ever.npy')
print(a)

[1 2 3 4 5]


In [52]:
# Other useful functions:
# np.mean(array,axis=?): average value along a given axis,
# np.std(array,axis=?): standard deviation along a given axis,
# np.dot(array,array): dot product of two 1-dimensional arrays of the same length,
# np.matmul(array, array): matrix multiplication of two 2-dimensional arrays,
# np.nonzero(array): returns indices of nonzero elements of the array,
# np.where(condition(array)): returns indices of the array where elements satisfy the condition,
# np.argsort(array): returns indices in order that sorts the array.

#### NumPy Random

In [53]:
# Random permutation:
identity=np.arange(20)
sigma=np.random.permutation(identity)
print(sigma)
print(identity[sigma]) # the same

# Exercise: bring sigma back to identity; i.e find its inverse:
# >>> sigma_inverse=?
# >>> sigma[sigma_inverse]==identity
# >>> True

[14 16  4 15 17  6  7  2  9 13 10  5 11 19 18  1 12  8  0  3]
[14 16  4 15 17  6  7  2  9 13 10  5 11 19 18  1 12  8  0  3]


In [54]:
# Random choice:
no_fours=[0 if i==4 else 1/9 for i in range(10)]
np.random.choice(a=range(10),replace=True,size=20,p=no_fours) # 'p' gives probabilities of choices in 'a'

array([9, 6, 7, 5, 9, 8, 8, 1, 2, 1, 0, 8, 2, 6, 0, 8, 0, 5, 2, 2])

In [55]:
# Random distributions:
# np.random.normal(loc=0,scale=1,size=(5,5))
# np.random.poisson(lam=6,size=1)
# np.random.binomial(n=5,p=0.5)

In [57]:
# Exercise: You add Unif(0,1) variables until your sum exceeds 1.
# How many random variables, on average, will there be in your sum?

# Repeat this procedure many times, average the results and guess the answer.
%matplotlib inline
import matplotlib.pyplot as plt

In [58]:
# Reproducibility:
np.random.seed(22) # run this before each random generation to replicate the same "random" output
print('original: {}'.format(np.random.uniform(0,1)))
np.random.seed(22)
print('original: {}'.format(np.random.uniform(0,1)))
np.random.seed(22)
print('original: {}'.format(np.random.uniform(0,1)))
print('different: {}'.format(np.random.uniform(0,1))) # seed not set
np.random.seed(22)
print("original: {}".format(np.random.uniform(0,1)))
np.random.seed(23)
print('different: {}'.format(np.random.uniform(0,1))) # seed set to a different value

original: 0.20846053735884262
original: 0.20846053735884262
original: 0.20846053735884262
different: 0.4816810617633659
original: 0.20846053735884262
different: 0.5172978838465893


#### Pandas
A Python library for efficiently storing, handling, and managing panel data.

In [60]:
import pandas as pd

from_list=[[3,1,4,1,5,9,2,6],[2,7,1,8,2,8,1,8],[1,6,1,8,0,3,3,9]]
#from_array=np.array([[3,1,4,1,5,9,2,6],[2,7,1,8,2,8,1,8],[1,6,1,8,0,3,3,9]])

digits=['one','two','three','four','five','six','seven','eight']
constants=pd.DataFrame(from_list,index=['pi','e','phi'],columns=digits)
constants

# pd.DataFrame(from_array,columns=['pi','e','phi'],index=digits) # won't work

Unnamed: 0,one,two,three,four,five,six,seven,eight
pi,3,1,4,1,5,9,2,6
e,2,7,1,8,2,8,1,8
phi,1,6,1,8,0,3,3,9


In [61]:
# DataFrame from Series or DataFrames:

pi=pd.Series([3,1,4,1,5,9,2,6],index=digits,name='pi') # note 'name', not 'columns' (because Series is 1D)
e=pd.Series([2,7,1,8,2,8,1,8],index=digits,name='e')
phi=pd.DataFrame([1,6,1,8,0,3,3,9],index=digits,columns=['phi']) # a one-column DataFrame works as well
phi=pd.DataFrame([1,6,1,8,0,3,3,9],columns=['phi'],index=digits) # same frame again: the order of keyword arguments does not matter

constants=pd.concat([pi,e,phi],axis=1) # axis=1: place the inputs side by side as columns
constants

Unnamed: 0,pi,e,phi
one,3,2,1
two,1,7,6
three,4,1,1
four,1,8,8
five,5,2,0
six,9,8,3
seven,2,1,3
eight,6,8,9


In [62]:
# We can take a transpose:
constants=constants.T
constants

Unnamed: 0,one,two,three,four,five,six,seven,eight
pi,3,1,4,1,5,9,2,6
e,2,7,1,8,2,8,1,8
phi,1,6,1,8,0,3,3,9


In [63]:
# And rename columns/index:
constants=constants.rename(index={'pi':'pie'},columns={'one':'uno'}) # rename 

In [64]:
# Add another row:
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, so the new
# one-row frame (same columns, index label 'root2') is stacked below with pd.concat.
constants=pd.concat([constants,pd.DataFrame([[1,4,1,4,2,1,3,5]],columns=constants.columns,index=['root2'])])
constants

Unnamed: 0,uno,two,three,four,five,six,seven,eight
pie,3,1,4,1,5,9,2,6
e,2,7,1,8,2,8,1,8
phi,1,6,1,8,0,3,3,9
root2,1,4,1,4,2,1,3,5


In [65]:
# Add another column:
constants['cool?']=['No','Yes','Yes','Yes']

In [66]:
constants

Unnamed: 0,uno,two,three,four,five,six,seven,eight,cool?
pie,3,1,4,1,5,9,2,6,No
e,2,7,1,8,2,8,1,8,Yes
phi,1,6,1,8,0,3,3,9,Yes
root2,1,4,1,4,2,1,3,5,Yes


In [67]:
# Visualization:
# Only the last expression of a cell is rendered automatically, so head()/tail()
# are wrapped in display(); otherwise their results would be silently discarded.
display(constants.head(5)) # top 5 rows
display(constants.tail(5)) # bottom 5 rows
constants.info() # summary of the DataFrame; useful for large real-life data (potential nulls, nans, etc.)

<class 'pandas.core.frame.DataFrame'>
Index: 4 entries, pie to root2
Data columns (total 9 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uno     4 non-null      int64 
 1   two     4 non-null      int64 
 2   three   4 non-null      int64 
 3   four    4 non-null      int64 
 4   five    4 non-null      int64 
 5   six     4 non-null      int64 
 6   seven   4 non-null      int64 
 7   eight   4 non-null      int64 
 8   cool?   4 non-null      object
dtypes: int64(8), object(1)
memory usage: 320.0+ bytes


In [68]:
# Saving with Pandas:
constants.to_csv('best_table_ever.csv',sep='#') # note the delimiter

# Loading with Pandas:
new_table=pd.read_csv('best_table_ever.csv',header=0,index_col=0,sep='#') # note the delimiter
# specify indices of columns and row to use as names (here, 0 and 0)

new_table

Unnamed: 0,uno,two,three,four,five,six,seven,eight,cool?
pie,3,1,4,1,5,9,2,6,No
e,2,7,1,8,2,8,1,8,Yes
phi,1,6,1,8,0,3,3,9,Yes
root2,1,4,1,4,2,1,3,5,Yes


In [69]:
# Accessing elements in a DataFrame by names (use .loc[]):
print(new_table.loc['pie':'e']['uno']) # note: the last index is included!
print('\n')
print(new_table.loc['pie','uno':'three'])

pie    3
e      2
Name: uno, dtype: int64


uno      3
two      1
three    4
Name: pie, dtype: object


In [70]:
# Accessing elements in a DataFrame by indices (use .iloc[]):
print(new_table.iloc[0:2].iloc[0]) # here we first select the first two rows ('pie' and 'e') as a DataFrame, then its first row ('pie') as a Series
print('\n')
print(new_table.iloc[0,0]) # a single cell: first row, first column

uno       3
two       1
three     4
four      1
five      5
six       9
seven     2
eight     6
cool?    No
Name: pie, dtype: object


3


In [71]:
# What if we have name for one dimension, but index for the other?
# A position is always given through .iloc: plain [0] on a label-indexed Series is
# treated as a label lookup in recent pandas (the positional fallback is deprecated).
print(new_table.loc['pie'].iloc[0]) # row by name, then column by position
print(new_table.iloc[0]['uno']) # row by position, then column by name

3
3


In [72]:
# Deleting rows:
display(new_table.drop('phi')) # pandas.drop only accepts names.
display(new_table.drop(new_table.index[2]))

Unnamed: 0,uno,two,three,four,five,six,seven,eight,cool?
pie,3,1,4,1,5,9,2,6,No
e,2,7,1,8,2,8,1,8,Yes
root2,1,4,1,4,2,1,3,5,Yes


Unnamed: 0,uno,two,three,four,five,six,seven,eight,cool?
pie,3,1,4,1,5,9,2,6,No
e,2,7,1,8,2,8,1,8,Yes
root2,1,4,1,4,2,1,3,5,Yes


In [73]:
# Deleting columns:
display(new_table.drop('five',axis=1)) # by default, axis=0
display(new_table.drop(new_table.columns[2],axis=1))

Unnamed: 0,uno,two,three,four,six,seven,eight,cool?
pie,3,1,4,1,9,2,6,No
e,2,7,1,8,8,1,8,Yes
phi,1,6,1,8,3,3,9,Yes
root2,1,4,1,4,1,3,5,Yes


Unnamed: 0,uno,two,four,five,six,seven,eight,cool?
pie,3,1,1,5,9,2,6,No
e,2,7,8,2,8,1,8,Yes
phi,1,6,8,0,3,3,9,Yes
root2,1,4,4,2,1,3,5,Yes


#### PyTrends
A python tool to extract Google Trends data as Pandas dataframes.

In [74]:
import pandas as pd
from pytrends.request import TrendReq

pt=TrendReq(hl='en-US',retries=10) # host language, retries.

# Google trends normalizes all trends data between 0 and 100 and accounts for
# the number of total searches at a given time and location. Google trends releases
# only relative interest magnitudes.

# Example:
search_words=["clinton","trump","pandemic"] # terms of interest
pt.build_payload(search_words,timeframe='2016-10-31 2016-11-07',geo='US') # temporal and spatial scope
data=pt.interest_by_region(resolution='REGION')
data.head(5) # for each region, the sum is 100, distributed among search expressions.

Unnamed: 0_level_0,clinton,trump,pandemic
geoName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Alabama,42,58,0
Alaska,38,62,0
Arizona,39,61,0
Arkansas,44,56,0
California,37,63,0


#### Mini-project:
*Use PyTrends to model proportion of votes cast for Trump in each state.*
- What search expressions should we fetch as predictors?
- Temporal scope?
- Possible bias?

In [75]:
search_words=[["Trump"],["Clinton"]]
scope='2016-10-31 2016-11-07'

In [76]:
# Let's preprocess the electoral data:
# ';' separates the fields and the column names are taken from the second line (header=1).
# 'error_bad_lines' was deprecated in pandas 1.3 and removed in 2.0; on_bad_lines='warn'
# keeps the old behaviour of error_bad_lines=False: malformed lines are skipped with a warning.
data=pd.read_csv('electoral_data.csv',header=1,sep=';',on_bad_lines='warn')
data

Unnamed: 0.1,Unnamed: 0,Trump (R),Clinton (D),Trump (R).1,Clinton (D).1,All Others,Total Vote,Unnamed: 7
0,AL,9,,1 318 255,729 547,75 570,2 123 372,AL
1,AK,3,,163 387,116 454,38 767,318 608,AK
2,AZ,11,,1 252 401,1 161 167,159 597,2 573 165,AZ
3,AR,6,,684 872,380 494,65 310,1 130 676,AR
4,CA,,55,4 483 814,8 753 792,943 998,14 181 604,CA
5,CO,,9,1 202 484,1 338 870,238 893,2 780 247,CO
6,CT,,7,673 215,897 572,74 133,1 644 920,CT
7,DE,,3,185 127,235 603,23 084,443 814,DE
8,DC,,3,12 723,282 830,15 715,311 268,DC
9,FL,29,,4 617 886,4 504 975,297 178,9 420 039,FL


In [77]:
# Dropping extra columns and rows:
# Both drops are positional and in place, so this cell must be run exactly once on the
# freshly loaded table; running it again would remove further columns.
data.drop(data.columns[[1,2,-1,-3]],axis=1,inplace=True) # columns at positions 1, 2, last and third from last
data.drop(data.index[51:59],inplace=True) # rows at positions 51-58

# Rename columns:
new_columns=['state','trumpRes','clintonRes','total']
data.rename(columns={name:new_columns[i] for i,name in enumerate(data.columns)},inplace=True) # i-th remaining column gets the i-th new name
data.head(5)

Unnamed: 0,state,trumpRes,clintonRes,total
0,AL,1 318 255,729 547,2 123 372
1,AK,163 387,116 454,318 608
2,AZ,1 252 401,1 161 167,2 573 165
3,AR,684 872,380 494,1 130 676
4,CA,4 483 814,8 753 792,14 181 604


In [78]:
# Let's check there are no Nans:
data.isna().any()

state         False
trumpRes      False
clintonRes    False
total         False
dtype: bool

In [79]:
# There is one hidden problem:
display(data.loc[:,'trumpRes'].iloc[0:5]) # seems ok;
print(type(data.loc[:,'trumpRes'].iloc[0])) # entries are actually strings...
display(data.loc[:,'trumpRes'].iloc[0]) # '\xa0' is the non-breaking space character (U+00A0), here sitting between digit groups

0    1 318 255
1      163 387
2    1 252 401
3      684 872
4    4 483 814
Name: trumpRes, dtype: object

<class 'str'>


'1\xa0318\xa0255'

In [80]:
# Let's fix this: strip the non-breaking spaces ('\xa0') and convert the counts to integers.
# The result is assigned with data[col]=..., which replaces the whole column and therefore
# its dtype; data.loc[:,col]=... writes into the existing object column in pandas 2.x and
# would leave the dtype as 'object'.
# astype(str) first makes the cell safe to re-run once a column is already numeric.
data['trumpRes']=data['trumpRes'].astype(str).str.replace('\xa0','',regex=False).astype(int)
data['total']=data['total'].astype(str).str.replace('\xa0','',regex=False).astype(int)

In [81]:
# Check:
print(type(data.loc[:,'total'].iloc[0]))

<class 'numpy.int64'>


In [82]:
# Add data from PyTrends:

for sw in search_words: # each 'sw' is a one-element list such as ['Trump'], used both as query and as column key
    pt.build_payload(sw,timeframe=scope,geo='US')
    # .values drops the region index, so the trend rows are paired with 'data' purely by position.
    # NOTE(review): this presumes both tables list the same regions in the same order -- confirm.
    data[sw]=pt.interest_by_region(resolution='REGION').values # values needed not to worry about index mismatch
data.head(5)

Unnamed: 0,state,trumpRes,clintonRes,total,Trump,Clinton
0,AL,1318255,729 547,2123372,67,76
1,AK,163387,116 454,318608,73,72
2,AZ,1252401,1 161 167,2573165,78,81
3,AR,684872,380 494,1130676,70,86
4,CA,4483814,8 753 792,14181604,73,67


In [83]:
# Use this if PyTrends fails:

# data.to_csv('clean_data.csv')
# data=pd.read_csv('clean_data.csv',header=0,index_col=0)

In [84]:
# Compute votes proportion:
data['trumpProp']=data.loc[:,'trumpRes']/data.loc[:,'total']
data.head(5)

Unnamed: 0,state,trumpRes,clintonRes,total,Trump,Clinton,trumpProp
0,AL,1318255,729 547,2123372,67,76,0.620831
1,AK,163387,116 454,318608,73,72,0.512815
2,AZ,1252401,1 161 167,2573165,78,81,0.486716
3,AR,684872,380 494,1130676,70,86,0.605719
4,CA,4483814,8 753 792,14181604,73,67,0.316171


In [85]:
# Drop unnecessary columns:
# They are named explicitly (instead of "the first four") and missing names are ignored,
# so running the cell a second time cannot remove the features or the target.
data.drop(columns=['state','trumpRes','clintonRes','total'],errors='ignore',inplace=True)
data.head(5)

Unnamed: 0,Trump,Clinton,trumpProp
0,67,76,0.620831
1,73,72,0.512815
2,78,81,0.486716
3,70,86,0.605719
4,73,67,0.316171


In [86]:
# Train/test split: the first 75% of the rows (in table order) train the model, the rest test it
num=int(0.75*len(data))
train,test=data.iloc[:num],data.iloc[num:]

In [87]:
# MinMax-scale features and target; both scalers learn their ranges from the training rows only
from sklearn.preprocessing import MinMaxScaler

X_scaler=MinMaxScaler()
y_scaler=MinMaxScaler()

# Features are every column but the last one; the target is the last column.
# Selecting it with a list ([-1]) keeps it two-dimensional, as the scaler requires.
X_train=X_scaler.fit_transform(train.iloc[:,:-1])
X_test=X_scaler.transform(test.iloc[:,:-1])
y_train=y_scaler.fit_transform(train.iloc[:,[-1]])
y_test=y_scaler.transform(test.iloc[:,[-1]])

display(X_train[:5]) # now these are numpy ndarrays
display(y_train[:5])

ModuleNotFoundError: No module named 'sklearn'

In [None]:
# Fit an ordinary linear regression on the scaled training data and predict the test rows
from sklearn.linear_model import LinearRegression as LR
model=LR()
model.fit(X_train,y_train)

actual=y_test # scaled ground truth, kept next to the prediction for comparison
prediction=model.predict(X_test)

In [None]:
# Scale back to original units:
# Both arrays are recomputed from the fitted model and the scaled test target instead of
# being overwritten in place, so running this cell again does not undo the scaling twice.

prediction=y_scaler.inverse_transform(model.predict(X_test))
actual=y_scaler.inverse_transform(y_test)

In [None]:
# Plot the results:
%matplotlib inline
import matplotlib.pyplot as plt

plt.plot(range(len(test)),prediction,color='fuchsia',label='predicted')
plt.plot(range(len(test)),actual,color='turquoise',label='actual')
plt.legend()
plt.xlabel('index')
plt.ylabel('proportion of votes for Trump')
plt.grid()
plt.show()