# Combine price features with the TF-IDF matrix to build regression models

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
import os
import re
from sklearn.linear_model import LinearRegression,Ridge,Lasso,RidgeCV,LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import RandomForestRegressor

In [2]:
def remove_Outliers(dataframe, column_name, outlierConstant = 1.5):
    """Drop rows whose ``column_name`` value lies on or beyond the IQR fences.

    The fences are ``Q1 - outlierConstant * (Q3 - Q1)`` and
    ``Q3 + outlierConstant * (Q3 - Q1)``. A row is removed when its value is
    ``<=`` the lower fence or ``>=`` the upper fence.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Input frame; it is not modified.
    column_name : hashable
        Label of the numeric column used to detect outliers.
    outlierConstant : float, default 1.5
        Multiplier applied to the inter-quartile range.

    Returns
    -------
    pandas.DataFrame
        A new frame without the outlier rows, re-indexed from 0.
    """
    values = np.asarray(dataframe[column_name])
    if len(values) == 0:
        # Nothing to measure quartiles on; hand back an (empty) re-indexed copy.
        return dataframe.reset_index(drop=True)

    lower_quartile, upper_quartile = np.percentile(values, [25, 75])
    spread = (upper_quartile - lower_quartile) * outlierConstant
    lower_fence = lower_quartile - spread
    upper_fence = upper_quartile + spread

    # Select by position with a boolean mask. The previous version compared
    # positional offsets against index *labels*, which only worked when the
    # frame happened to carry a default 0..n-1 index.
    is_outlier = (values <= lower_fence) | (values >= upper_fence)
    return dataframe[~is_outlier].reset_index(drop=True)

In [3]:
# Custom stop words: one entry per line of the file, with surrounding
# whitespace (including the trailing newline) stripped.
stop_words_list = []
with open("stop_word_new.txt", "r") as f:
    stop_words_list.extend(str(entry.strip()) for entry in f)

In [4]:
# Pre-process a document; pass exclude_digit=True to drop digits from the tokens
def pre_process(file_content, exclude_digit):
    """Lower-case ``file_content`` and split it into alphanumeric tokens.

    Parameters
    ----------
    file_content : str
        Raw text of the document.
    exclude_digit : bool
        ``False`` keeps digits (commas are deleted first, so ``"1,000"``
        becomes the single token ``"1000"``); any other value treats digits
        as separators.

    Returns
    -------
    list of str
        The whitespace-separated tokens.
    """
    text = file_content.lower()

    if exclude_digit == False:  # noqa: E712 -- equality test kept on purpose
        # Remove thousands separators before punctuation becomes whitespace.
        text = re.sub(',', '', text)
        separator_pattern = '[^a-zA-Z0-9]'
    else:
        separator_pattern = '[^a-zA-Z]'

    text = re.sub(separator_pattern, ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text.split()

In [5]:
def Exclude_Stop_words(processed_article, stop_words_list):
    """Remove stop words from a token sequence and join the rest with spaces.

    Parameters
    ----------
    processed_article : iterable of str
        Tokens of one document, in order.
    stop_words_list : iterable of str
        Words to drop (exact, case-sensitive match).

    Returns
    -------
    str
        The surviving tokens joined by single spaces (``''`` if none remain).
    """
    # Build the lookup once: set membership is O(1) per token, whereas
    # scanning a list costs O(len(stop_words_list)) for every token.
    stop_words = frozenset(stop_words_list)
    return ' '.join(word for word in processed_article if word not in stop_words)

In [6]:
def pre_encode_tfidf(path, files, skip_chars=3000):
    """Read documents from disk and return them as cleaned, space-joined strings.

    Each file is read, its first ``skip_chars`` characters are discarded,
    the remainder is tokenised with ``pre_process`` (digits kept) and stop
    words from the module-level ``stop_words_list`` are removed.

    Parameters
    ----------
    path : str
        Directory containing the documents.
    files : iterable of str
        File names inside ``path``; entries named ``'.DS_Store'`` are skipped.
    skip_chars : int, default 3000
        Number of leading characters dropped from every file.
        NOTE(review): presumably this skips a boilerplate filing header --
        confirm the offset against the data.

    Returns
    -------
    list of str
        One cleaned document string per file that was read, in input order.
    """
    X = []
    for filename in files:
        if filename == '.DS_Store':
            # macOS folder metadata, not a document.
            continue
        with open(path+'/'+filename,'r') as file:
            raw_text = file.read()[skip_chars:]
        tokens = pre_process(raw_text, False)
        X.append(Exclude_Stop_words(tokens, stop_words_list))
    return X

In [7]:
def model(X,y, model_name):
    """Fit and return the regressor selected by ``model_name``.

    Parameters
    ----------
    X : array-like
        Training features.
    y : array-like
        Training targets.
    model_name : str
        One of ``'linear'``, ``'ridge'``, ``'lasso'`` or ``'RandomForest'``.

    Returns
    -------
    The fitted scikit-learn estimator.

    Raises
    ------
    ValueError
        If ``model_name`` is not one of the supported names. (Previously this
        surfaced as an ``UnboundLocalError`` at the ``return`` statement.)
    """
    # Lazily-built estimators so that only the requested one is constructed.
    builders = {
        'linear': lambda: LinearRegression(),
        'ridge': lambda: Ridge(alpha=0.0001),
        'lasso': lambda: Lasso(alpha=0.1, max_iter = 100000),
        'RandomForest': lambda: RandomForestRegressor(n_estimators = 500, random_state = 1),
    }
    build = builders.get(model_name)
    if build is None:
        raise ValueError(
            'Unknown model_name {!r}; expected one of {}'.format(
                model_name, sorted(builders)))
    return build().fit(X, y)

In [8]:
def evaluation_model(model, x_train, y_train):
    """Print 10-fold cross-validated scores for ``model`` and return the first.

    Three separate ``cross_val_score`` runs are made, with scoring
    ``'neg_mean_squared_error'``, ``'neg_mean_absolute_error'`` and ``'r2'``.
    The mean of each run, rounded to 4 decimals, is printed.

    Note that the two error scores use the ``neg_`` scorers, so the printed
    and returned error values are negated, and that the line labelled
    ``mbe`` reports the mean *absolute* error scorer.

    Parameters
    ----------
    model : estimator passed to ``cross_val_score``
    x_train, y_train : training features and targets

    Returns
    -------
    The rounded mean of the ``'neg_mean_squared_error'`` fold scores.
    """
    def mean_cv_score(scoring):
        # Mean over the 10 folds, rounded for display.
        fold_scores = cross_val_score(model, x_train, y_train, cv=10, scoring=scoring)
        return round(np.mean(fold_scores), 4)

    mean_neg_mse = mean_cv_score('neg_mean_squared_error')
    mean_neg_mae = mean_cv_score('neg_mean_absolute_error')
    mean_r2 = mean_cv_score('r2')

    print('CV mse is', mean_neg_mse)
    print('CV mbe is', mean_neg_mae)
    print('CV R^2 is', mean_r2)
    return mean_neg_mse

In [9]:
def word_embedding_all(file,path):
    """Build the feature matrix and target for one year of filings.

    Steps: read the features CSV, drop outlier rows on ``close_adjusted_x``
    and then on ``close_adjusted_y``, load and clean the documents listed in
    the ``filename`` column, TF-IDF encode them, and append the ``high``,
    ``low`` and ``close_adjusted_x`` columns to the encoded matrix.

    Parameters
    ----------
    file : str
        Path of the CSV; the code reads its ``filename``, ``high``, ``low``,
        ``close_adjusted_x`` and ``close_adjusted_y`` columns.
    path : str
        Directory containing the documents named in ``filename``.

    Returns
    -------
    X : pandas.DataFrame
        Dense TF-IDF columns (integer labels) followed by the three price
        columns.
    y : pandas.Series
        The ``close_adjusted_y`` column after outlier removal.
    """
    d1 = pd.read_csv(file)
    d2 = remove_Outliers(d1,'close_adjusted_x')
    d3 = remove_Outliers(d2,'close_adjusted_y')
    files = d3['filename'].tolist()
    y = d3['close_adjusted_y']

    documents = pre_encode_tfidf(path, files)
    vectorizer = TfidfVectorizer(analyzer='word',stop_words='english')
    tfidf = vectorizer.fit_transform(documents)

    # Densify exactly once; the earlier extra `toarray()` call built a full
    # dense copy of the matrix only to throw it away.
    X = pd.DataFrame(tfidf.toarray())

    # Column assignment aligns on the index: X gets a fresh 0..n-1 index and
    # remove_Outliers returns d3 with its index reset the same way.
    for price_column in ('high', 'low', 'close_adjusted_x'):
        X[price_column] = d3[price_column]
    return X,y

In [10]:
# Find the best parameters for Ridge, Lasso and RandomForest
def choose_parameter(x_train, y_train):
    """Print hyper-parameter search results for the three regularised/ensemble models.

    Prints, in order: the alpha chosen by ``RidgeCV``, the alpha chosen by
    ``LassoCV`` (10-fold), and then for each forest size the size itself,
    the progress markers ``1`` and ``2``, and the scores printed by
    ``evaluation_model``. Finally the value returned by ``evaluation_model``
    for the last forest size is printed. Nothing is returned.

    Parameters
    ----------
    x_train, y_train : training features and targets
    """
    alpha_grid = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

    # NOTE(review): `normalize=` was removed from these estimators in newer
    # scikit-learn releases -- confirm the pinned version before running.
    ridge_search = RidgeCV(alphas = alpha_grid, scoring = 'neg_mean_squared_error', normalize = True)
    ridge_search.fit(x_train, y_train)
    print(ridge_search.alpha_)

    lasso_search = LassoCV(alphas = alpha_grid, cv = 10, normalize = True, max_iter = 100000)
    lasso_search.fit(x_train, y_train)
    print(lasso_search.alpha_)

    for n_trees in (50, 100, 200, 500):
        print(n_trees)
        forest = RandomForestRegressor(n_estimators = n_trees, random_state = 1)
        print('1')
        forest.fit(x_train, y_train)
        print('2')
        evaluation = evaluation_model(forest, x_train, y_train)
    # Runs once, after the loop: only the last forest size's score is shown.
    print(evaluation)

## Using the 2002 data to find the best parameters for Ridge, Lasso and RandomForest

In [11]:
# path = '/Users/faustune/Desktop/data/2002.full'
# file = '2002_10k_2003_price_all_features.csv'
# X,y = word_embedding_all(file, path)
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)
# choose_parameter(X_train, y_train)

In [12]:
# Feature files are named '<year>_10k_<year+1>_price_all_features.csv',
# one per year from 1998 through 2006.
f = ['{0}_10k_{1}_price_all_features.csv'.format(start_year, start_year + 1)
     for start_year in range(1998, 2007)]
# Root directory; the training loop appends '<year>.full' to it.
p = '/Users/faustune/Desktop/data/'

# Train model & calculate MSE

In [13]:
# Maps each filing year (first four characters of the CSV name) to the list of
# test-set MSEs in the order: linear, ridge, lasso, random forest.
mse = dict()
for file in f:
    year = file[0:4]
    path = p + year + '.full'
    X,y = word_embedding_all(file,path)
    # NOTE(review): no random_state is passed, so the split (and therefore
    # every score below) differs from run to run.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33)

    # Create this year's entry before appending; appending to a missing key
    # raised KeyError on the very first iteration.
    mse[year] = []

    reg = model(X_train,y_train, model_name = 'linear')
    y_pred1 = reg.predict(X_test)
    mse[year].append(mean_squared_error(y_test, y_pred1))

    ridge = model(X_train,y_train, model_name = 'ridge')
    y_pred2 = ridge.predict(X_test)
    mse[year].append(mean_squared_error(y_test, y_pred2))

    lasso = model(X_train,y_train, model_name = 'lasso')
    y_pred3 = lasso.predict(X_test)
    mse[year].append(mean_squared_error(y_test, y_pred3))

    rf = model(X_train,y_train, model_name = 'RandomForest')
    y_pred4 = rf.predict(X_test)
    mse[year].append(mean_squared_error(y_test, y_pred4))

In [14]:
# print(mse_reg, mse_ridge, mse_lasso, mse_rf)

In [15]:
# One row per year (the dict keys), one column per model; the column order
# must match the order in which the scores were appended to each list.
mse_columns = ['linear', 'ridge', 'lasso', 'forest']
df = pd.DataFrame.from_dict(mse, orient='index', columns=mse_columns)
df.to_csv("tfidf_all_feature_mse.csv")