In [1]:
# import basic modules
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import datetime
from nltk.corpus import stopwords
import pickle
import dill 

# import specialised modules
from sklearn.linear_model import LinearRegression

In [2]:
# Load the three pickled inputs from ../data.
# NOTE: pickle.load can execute arbitrary code; only use it on trusted files.
def _load_pickle(path):
    """Return the object unpickled from the binary file at ``path``."""
    with open(path, "rb") as handle:
        return pickle.load(handle)


good_words_text = _load_pickle("../data/good_words_text")
good_words_title = _load_pickle("../data/good_words_title")
training_data = _load_pickle("../data/training_data")

In [3]:
# Functions for feature vectors

def less_than_date(input, cutoff=None):
    """Flag rows whose ``dates`` value lies strictly before a cutoff.

    Parameters
    ----------
    input : DataFrame-like with a ``dates`` column of datetimes.
    cutoff : datetime.datetime, optional
        Boundary to compare against. Defaults to 2013-01-01, the value that
        was previously hard-coded.

    Returns
    -------
    list of int
        One entry per row: 1 when ``dates < cutoff``, otherwise 0. Rows for
        which the comparison is false (including missing dates) give 0.
    """
    if cutoff is None:
        cutoff = datetime.datetime(2013, 1, 1)
    is_before = input["dates"] < cutoff
    return [int(flag) for flag in is_before]

def data_as_var(input):
    """Return the ``time`` column as a list with negatives replaced by 0.

    Any value for which ``value >= 0`` is false (negative numbers, NaN) is
    emitted as 0; every other value is passed through unchanged.
    """
    clamped = []
    for value in input.time:
        if value >= 0:
            clamped.append(value)
        else:
            clamped.append(0)
    return clamped

def has_url(input):
    """Flag rows that carry a URL.

    Returns a list with one int per row of ``input``: 1 when the ``url``
    value is present, 0 when it is missing (NaN/None).
    """
    # notna(), not isna(): the flag must be 1 when a URL exists, as the name says.
    return [int(present) for present in input["url"].notna()]

def has_title(input):
    """Flag rows that carry a title.

    Returns a list with one int per row of ``input``: 1 when the ``title``
    value is present, 0 when it is missing (NaN/None).
    """
    # notna(), not isna(): the flag must be 1 when a title exists, as the name says.
    return [int(present) for present in input["title"].notna()]

def has_text(input):
    """Flag rows that carry body text.

    Returns a list with one int per row of ``input``: 1 when the ``text``
    value is present, 0 when it is missing (NaN/None).
    """
    # notna(), not isna(): the flag must be 1 when text exists, as the name says.
    return [int(present) for present in input["text"].notna()]

def contains_word_text(input, word):
    """Flag rows whose ``text`` contains ``word`` as a space-separated token.

    Parameters
    ----------
    input : DataFrame-like with a ``text`` column.
    word : str
        Token to look for. Matching is exact and case-sensitive against the
        pieces produced by ``text.split(" ")``.

    Returns
    -------
    list of int
        One entry per row: 1 when the token is found, 0 otherwise. Rows whose
        ``text`` is not a string (e.g. missing) give 0.
    """
    # Iterate the column directly; iterrows() would build a Series for every
    # row only to read a single field from it.
    flags = []
    for body in input["text"]:
        if isinstance(body, str):
            flags.append(int(word in body.split(" ")))
        else:
            flags.append(0)
    return flags

def contains_word_title(input, word):
    """Flag rows whose ``title`` contains ``word`` as a space-separated token.

    Parameters
    ----------
    input : DataFrame-like with a ``title`` column.
    word : str
        Token to look for. Matching is exact and case-sensitive against the
        pieces produced by ``title.split(" ")``.

    Returns
    -------
    list of int
        One entry per row: 1 when the token is found, 0 otherwise. Rows whose
        ``title`` is not a string (e.g. missing) give 0.
    """
    flags = []
    for heading in input["title"]:
        # Guard on the title itself. The type check used to look at the text
        # column, which crashed on rows with text but no title and ignored
        # titles on rows without text.
        if isinstance(heading, str):
            flags.append(int(word in heading.split(" ")))
        else:
            flags.append(0)
    return flags

In [4]:
def create_x(input, good_words, n_words=50):
    """Build the feature rows for the text-keyword model.

    Parameters
    ----------
    input : DataFrame-like accepted by ``less_than_date``, ``has_url``,
        ``has_title`` and ``contains_word_text``.
    good_words : sequence of str
        Candidate keywords; only the first ``n_words`` are used.
    n_words : int, optional
        Number of leading keywords to turn into features (default 50, the
        value that was previously hard-coded). ``None`` uses all of them.

    Returns
    -------
    list of list
        One row per input row, laid out as
        ``[1, t, 1 - t, has_url flag, has_title flag, word flags...]``
        where ``t`` is the ``less_than_date`` flag.
    """
    before_cutoff = less_than_date(input)
    url_flags = has_url(input)
    title_flags = has_title(input)
    word_columns = [contains_word_text(input, word) for word in good_words[:n_words]]

    rows = []
    for i, flag in enumerate(before_cutoff):
        # The constant 1, ``flag`` and ``1 - flag`` are linearly dependent;
        # the layout is kept as-is so the feature width does not change.
        row = [1, flag, 1 - flag, url_flags[i], title_flags[i]]
        row.extend(column[i] for column in word_columns)
        rows.append(row)
    return rows

def create_x2(input, good_words, n_words=10):
    """Build the feature rows for the title-keyword model with a date flag.

    Parameters
    ----------
    input : DataFrame-like accepted by ``less_than_date``, ``has_url``,
        ``has_title``, ``has_text`` and ``contains_word_title``.
    good_words : sequence of str
        Candidate keywords; only the first ``n_words`` are used, and they are
        matched against titles.
    n_words : int, optional
        Number of leading keywords to turn into features (default 10, the
        value that was previously hard-coded). ``None`` uses all of them.

    Returns
    -------
    list of list
        One row per input row, laid out as
        ``[1, t, has_url flag, has_title flag, has_text flag, word flags...]``
        where ``t`` is the ``less_than_date`` flag.
    """
    before_cutoff = less_than_date(input)
    url_flags = has_url(input)
    title_flags = has_title(input)
    text_flags = has_text(input)
    word_columns = [contains_word_title(input, word) for word in good_words[:n_words]]

    rows = []
    for i, flag in enumerate(before_cutoff):
        row = [1, flag, url_flags[i], title_flags[i], text_flags[i]]
        row.extend(column[i] for column in word_columns)
        rows.append(row)
    return rows

def create_x3(input, good_words, n_words=10):
    """Build the feature rows for the title-keyword model with a time value.

    Same layout as ``create_x2`` except that the second column is the
    ``data_as_var`` value (the ``time`` column with negatives replaced by 0)
    instead of the before-cutoff flag.

    Parameters
    ----------
    input : DataFrame-like accepted by ``data_as_var``, ``has_url``,
        ``has_title``, ``has_text`` and ``contains_word_title``.
    good_words : sequence of str
        Candidate keywords; only the first ``n_words`` are used, and they are
        matched against titles.
    n_words : int, optional
        Number of leading keywords to turn into features (default 10, the
        value that was previously hard-coded). ``None`` uses all of them.

    Returns
    -------
    list of list
        One row per input row, laid out as
        ``[1, time, has_url flag, has_title flag, has_text flag, word flags...]``.
    """
    time_values = data_as_var(input)
    url_flags = has_url(input)
    title_flags = has_title(input)
    text_flags = has_text(input)
    word_columns = [contains_word_title(input, word) for word in good_words[:n_words]]

    rows = []
    for i, time_value in enumerate(time_values):
        row = [1, time_value, url_flags[i], title_flags[i], text_flags[i]]
        row.extend(column[i] for column in word_columns)
        rows.append(row)
    return rows

In [7]:
def train_model(input, x_fun, word_list):
    """Fit a LinearRegression on features built by ``x_fun``.

    ``input.score`` is the target; any score for which ``score >= 0`` is
    false is replaced by 0 before fitting. ``x_fun(input, word_list)`` must
    return the feature rows. Returns the fitted model.
    """
    regressor = LinearRegression()
    targets = []
    for score in list(input.score):
        targets.append(score if score >= 0 else 0)
    features = x_fun(input, word_list)
    regressor.fit(features, targets)
    return regressor

In [8]:
# Model 1: rows of type "story", features from create_x, text keyword list.
model1 = train_model(
    training_data[training_data["type"] == "story"],
    create_x,
    good_words_text,
)

In [9]:
# Model 2: rows of type "story", features from create_x2.
# NOTE(review): create_x2 matches its keywords against titles
# (contains_word_title) but is given good_words_text here, whereas model3
# pairs the title-based builder with good_words_title. Confirm which word
# list is intended.
model2 = train_model(
    training_data[training_data["type"] == "story"],
    create_x2,
    good_words_text,
)

In [10]:
# Model 3: rows of type "story", features from create_x3, title keyword list.
model3 = train_model(
    training_data[training_data["type"] == "story"],
    create_x3,
    good_words_title,
)

In [11]:
# Fitted models, in the same order as the feature builders in `functions`.
models = [model1, model2, model3]

In [12]:
# Feature builders, in the same order as the fitted models in `models`.
functions = [create_x, create_x2, create_x3]

In [13]:
# Persist the list of feature-builder functions with dill (the models below
# are written with plain pickle). The path is relative to the current working
# directory, not ../data where the inputs were read from.
with open("functions", "wb") as fb:
    dill.dump(functions, fb)

In [14]:
# Persist the list of fitted models with pickle. The path is relative to the
# current working directory, not ../data where the inputs were read from.
with open("models", "wb") as fb:
    pickle.dump(models, fb)