In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import os
from datetime import datetime, timedelta

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

%matplotlib inline

In [None]:
def extract_time(row):
    """Return the time of day of ``row.date``, moved back seven hours and cut to the minute.

    ``row.date`` must be a string shaped like ``2019-08-14T15:30:45.123Z``.
    The shift is a constant -7h (meant as Pacific time); daylight-saving
    changes are not taken into account.
    """
    parsed = datetime.strptime(row.date, '%Y-%m-%dT%H:%M:%S.%fZ')
    shifted = parsed - timedelta(hours=7)
    # Zero out seconds and sub-seconds so that only hour and minute remain.
    return shifted.replace(second=0, microsecond=0).time()

def extract_seconds(row):
    """Return ``row.time`` as the number of minutes elapsed since midnight.

    Despite the function's name the unit is minutes (hour * 60 + minute);
    the seconds component of ``row.time`` is ignored.
    """
    clock = row.time
    return 60 * clock.hour + clock.minute

# Load all_extrap.csv and index the rows by post_id.
votes = pd.read_csv("all_extrap.csv").set_index('post_id')
# Time-of-day features, currently disabled:
# votes['time'] = votes.apply(lambda row: extract_time(row), axis=1)
# votes['second'] = votes.apply(lambda row: extract_seconds(row), axis=1)
# Squared score as an extra column; a vectorised product replaces the
# Python-level row-by-row apply.
votes['score_2'] = votes['score'] * votes['score']
# None means "never truncate cell contents"; the older -1 sentinel is
# deprecated and rejected by current pandas releases.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', 500)

In [None]:
def train_model(votes, from_band, to_band):
    """Fit a one-feature linear model predicting the score at ``to_band`` from the score at ``from_band``.

    The rows of ``votes`` at ``from_band`` are inner-joined (on the index)
    with the ``score`` column of the rows at ``to_band``; the joined target
    column is named ``score_final``. The joined frame is split 80/20 with a
    fixed seed, the regression is fitted on the larger part and evaluated on
    the remainder.

    Returns:
        dict with
        'coeff': ``[intercept, slope]`` of the fitted line
        'rms':   root-mean-square error on the held-out 20%
    """
    feature_cols = ['score']
    start_rows = votes[votes.band == from_band]
    target_scores = votes[votes.band == to_band].score
    paired = start_rows.join(target_scores, how='inner', rsuffix='_final')

    fit_part, eval_part = train_test_split(paired, test_size=0.2, random_state=42)

    # Targets are passed as column vectors, which makes intercept_ and coef_
    # come back as arrays (hence the [0] / [0][0] indexing below).
    y_fit = fit_part.score_final.values.reshape(-1, 1)
    y_eval = eval_part.score_final.values.reshape(-1, 1)

    regressor = LinearRegression()
    regressor.fit(fit_part[feature_cols], y_fit)
    predicted = regressor.predict(eval_part[feature_cols])

    intercept = regressor.intercept_[0]
    slope = regressor.coef_[0][0]
    return {
        'coeff': [intercept, slope],
        'rms': np.sqrt(mean_squared_error(y_eval, predicted)),
    }

In [None]:
def get_split_values(data, n_bands=288):
    """Return, for every band, the score threshold separating the low bucket from the high one.

    For each band the threshold is the upper edge of the first of the ten
    equal-width bins that ``np.histogram`` lays over that band's scores.

    Args:
        data: DataFrame with ``band`` and ``score`` columns.
        n_bands: number of bands to process (``0 .. n_bands - 1``). Defaults
            to 288, the value that used to be hard-coded here.

    Returns:
        list of ``n_bands`` plain Python floats, one threshold per band, in
        band order. Plain floats (rather than NumPy scalars) keep the list
        JSON-serialisable whatever the dtype of ``score``.
    """
    return [
        # First [1] picks the bin edges, second [1] the right edge of bin 0.
        float(np.histogram(data[data.band == band].score)[1][1])
        for band in range(n_bands)
    ]
    
# Number of bands handled by the loops below (named once instead of repeating the literal).
N_BANDS = 288

split_values = get_split_values(votes)

# Running sums of the held-out RMS of every fitted model, one per population.
low_error = 0
high_error = 0
total_error = 0

# all_*_predictions[a][i] is the [intercept, slope] pair of the model that
# maps the score at band a to the score at band a + 1 + i.
all_low_predictions = []
all_high_predictions = []
all_predictions = []

for from_band in range(N_BANDS):
    at_band = votes.band == from_band
    threshold = split_values[from_band]

    # votes is indexed by post_id; selecting by those labels returns every row
    # carrying the label, i.e. presumably the post's rows for all bands -
    # train_model needs both the from_band and the to_band row of each post.
    low_model = votes.loc[votes[at_band & (votes.score < threshold)].index]
    high_model = votes.loc[votes[at_band & (votes.score >= threshold)].index]

    low_predictions = []
    high_predictions = []
    predictions = []

    for to_band in range(from_band + 1, N_BANDS):
        res = train_model(low_model, from_band, to_band)
        low_predictions.append(res['coeff'])
        low_error += res['rms']

        res = train_model(high_model, from_band, to_band)
        high_predictions.append(res['coeff'])
        high_error += res['rms']

        res = train_model(votes, from_band, to_band)
        predictions.append(res['coeff'])
        total_error += res['rms']

    all_low_predictions.append(low_predictions)
    all_high_predictions.append(high_predictions)
    all_predictions.append(predictions)

print("Error is (low, high, overall) ", low_error, high_error, total_error)

In [None]:
import json

# Persist the fitted [intercept, slope] tables for the three populations
# together with the per-band low/high split thresholds.
with open('coeff_split.json', 'w') as outfile:
    payload = {
        'low': all_low_predictions,
        'high': all_high_predictions,
        'all': all_predictions,
        'split': split_values,
    }
    json.dump(payload, outfile)

In [None]:
from sklearn.model_selection import cross_val_score

# NOTE(review): this cell used to read a module-level `data` frame that no
# cell in the notebook creates (it only exists as a local inside train_model),
# so it failed on a fresh kernel. The frame is rebuilt here with the same join
# train_model uses. The band pair is an assumption - set it to the pair you
# want to cross-validate.
cv_from_band = 100
cv_to_band = 287
data = votes[votes.band == cv_from_band].join(
    votes[votes.band == cv_to_band].score, how='inner', rsuffix='_final'
)

lin_reg = LinearRegression()
learn_from = ['score']
scores = cross_val_score(
    lin_reg,
    data[learn_from],
    data.score_final.values.reshape(-1, 1),
    scoring="neg_mean_squared_error",
    cv=10,
)
# The scorer yields negated MSE per fold; flip the sign and take the root to get RMSE.
scores = np.sqrt(-scores)
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("std dev: ", scores.std())

In [None]:
# Histogram of the score column for the rows at band 100.
votes.loc[votes.band == 100, 'score'].hist()

In [None]:
# How many band-287 rows fall into the first of the ten histogram bins.
final_band_scores = votes.loc[votes.band == 287, 'score']
count = np.histogram(final_band_scores)  # (counts per bin, bin edges)
print(count[0][0])

In [None]:
# Rows at band 35 with a score below 50 whose scoreAt24h nevertheless exceeds 8000.
low_start_high_finish = (votes.band == 35) & (votes.score < 50) & (votes.scoreAt24h > 8000)
# Re-select by index label so that every row sharing those labels is plotted, not just band 35.
votes.loc[votes[low_start_high_finish].index].plot(x='band', y='score', figsize=(15,5), style='.')

In [None]:
# Index labels of the rows at band 35 with score < 50 and scoreAt24h > 8000.
slow_starters = (votes.band == 35) & (votes.score < 50) & (votes.scoreAt24h > 8000)
votes.loc[slow_starters].index

In [None]:
# Pairwise correlations between all columns that are not of object dtype.
non_object_cols = votes.dtypes != 'object'
corr = votes.loc[:, non_object_cols].corr()
palette = sns.diverging_palette(220, 10, as_cmap=True)
sns.heatmap(corr, xticklabels=corr.columns, yticklabels=corr.columns, cmap=palette)

In [None]:
# Write the (band, score) pairs of one post to values.json as a bare array of rows.
example_post = '/r/dataisbeautiful/comments/cq2ng5/oc_game_of_thrones_ratings/'
example_rows = votes.loc[example_post][['band','score']]
example_rows.to_json(path_or_buf='values.json',orient='values')

In [None]:
import matplotlib.pyplot as plt

# Observed band/score rows of a single post.
df = pd.DataFrame()
src = votes.loc['/r/dataisbeautiful/comments/cq2ng5/oc_game_of_thrones_ratings/']
df['band'] = src['band']
df['score'] = src['score']

band = 2
# model[i] is the [intercept, slope] pair predicting the score at band
# (band + 1 + i) from the score at `band`.
model = all_predictions[band]
# Score at the starting band. Positional access replaces `df.score[2]`, which
# relied on the deprecated integer fallback on a non-integer index and
# hard-coded the band. Assumes the post's rows are ordered by band starting
# at 0 - TODO confirm.
start_score = df.score.iloc[band]
# Bands the predictions belong to: the first model targets band + 1, not band.
t = range(band + 1, band + 1 + len(model))
# A list rather than a bare map(): map() is a one-shot iterator that can be
# neither displayed nor plotted.
s = [intercept + slope * start_score for intercept, slope in model]
# fig, ax = plt.subplots()
# ax = df.plot(ax=ax, kind='line', x='band', y='score')
# ax.plot(t, s, color='#0033ff22')
# plt.show()

In [None]:
# [intercept, slope] of the model from band 2 to band 287 (index 284 = 287 - 2 - 1).
all_predictions[2][284]

In [None]:
# Show `s`, the predictions built in the cell that defines `band`, `model` and `t`.
s

In [None]:
# Number of score rows available for the post loaded into `df`.
len(df.score)