In [None]:
# Prep environment:  setup commandline method and import necessary libraries

from __future__ import absolute_import, division, print_function, unicode_literals
from google.colab import files
from google.colab import drive
import tarfile
import os
import json
import requests
import sys
import shutil
import re
from tqdm import tqdm, trange
import numpy as np
import tensorflow as tf
from tensorflow.core.protobuf import rewriter_config_pb2
from tensorflow.python.client import device_lib
import time
from datetime import datetime
import csv
import argparse
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
import tensorflow.compat.v2.feature_column as fc
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.exceptions import NotFittedError
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_absolute_error

# Run and print a shell command.
def run(cmd):
  """Echo a shell command and then execute it through the notebook's shell.

  The command is first printed with a '>> ' prefix so the cell output shows
  what was run, then handed to IPython's `!` shell escape.

  Args:
    cmd: The shell command line to run; it is interpolated as-is into the
      `!` escape, so it should only ever come from trusted notebook code.
  """
  print('>> {}'.format(cmd))
  !{cmd}  # This command runs the user input as a command in the shell.  This code cell can be reused for other necessary activities in the shell.

# first run pip to install necessary modules
!pip install -q sklearn

# add tensor flow 
%tensorflow_version 2.x 

file_name = 'init.csv'  #@param {type: "string"}
drive.mount('/content/drive')
run('mkdir -p src/')
run('ls -lh src/')

Mounted at /content/drive
>> mkdir -p src/
>> ls -lh src/
total 0


In [None]:
# Copy the training file from the mounted Drive into the notebook's local src/ directory.
# file_dest is the local path that the load step reads from.
file_dest = "src/" + file_name
shutil.copyfile(src="/content/drive/My Drive/" + file_name, dst=file_dest)
# List src/ to confirm the file arrived.
run('ls -lh src/')

>> ls -lh src/
total 16K
-rw-r--r-- 1 root root 15K Apr 15 05:44 init.csv


In [None]:
# Load the dataset: read the local copy of the training CSV into a DataFrame.
dftrain = pd.read_csv(file_dest)

In [None]:
# Preview the first rows to sanity-check the columns and value ranges.
dftrain.head()

Unnamed: 0,ANGER,DISGUST,FEAR,JOY,SADNESS,TENTATIVE,ANALYTICAL,CONFIDENT,VIEWS
0,0.576674,0,0.0,0.0,0.0,0.700148,0.587578,0.793762,4034155
1,0.0,0,0.0,0.697598,0.604259,0.797051,0.749305,0.865994,368612
2,0.584267,0,0.0,0.0,0.58147,0.727649,0.578724,0.808171,5498684
3,0.597095,0,0.0,0.553043,0.81396,0.599641,0.57043,0.749891,138764
4,0.618312,0,0.0,0.0,0.0,0.789754,0.575111,0.751512,150865


In [None]:
# Preview the last rows to confirm the file was read through to the end.
dftrain.tail()

Unnamed: 0,ANGER,DISGUST,FEAR,JOY,SADNESS,TENTATIVE,ANALYTICAL,CONFIDENT,VIEWS
221,0.56736,0,0.0,0.0,0.0,0.660003,0.602623,0.848848,168483
222,0.596254,0,0.0,0.0,0.0,0.706297,0.595788,0.782686,279623
223,0.0,0,0.0,0.651814,0.614615,0.840001,0.654551,0.833419,73882
224,0.571727,0,0.0,0.0,0.0,0.712449,0.589308,0.889641,116713
225,0.577606,0,0.0,0.0,0.0,0.671881,0.622282,0.831641,146227


In [None]:
# Show column dtypes and non-null counts to check for missing values.
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 226 entries, 0 to 225
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   ANGER       226 non-null    float64
 1   DISGUST     226 non-null    int64  
 2   FEAR        226 non-null    float64
 3   JOY         226 non-null    float64
 4   SADNESS     226 non-null    float64
 5   TENTATIVE   226 non-null    float64
 6   ANALYTICAL  226 non-null    float64
 7   CONFIDENT   226 non-null    float64
 8   VIEWS       226 non-null    int64  
dtypes: float64(7), int64(2)
memory usage: 16.0 KB


In [None]:
# Summary statistics per column (count, mean, std, min, quartiles, max).
dftrain.describe()

Unnamed: 0,ANGER,DISGUST,FEAR,JOY,SADNESS,TENTATIVE,ANALYTICAL,CONFIDENT,VIEWS
count,226.0,226.0,226.0,226.0,226.0,226.0,226.0,226.0,226.0
mean,0.459608,0.0,0.135158,0.379476,0.314625,0.672091,0.63881,0.635032,2730170.0
std,0.257486,0.0,0.277992,0.334405,0.306333,0.226021,0.099838,0.296438,4590516.0
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,231.0
25%,0.544876,0.0,0.0,0.0,0.0,0.679647,0.575079,0.656784,181007.8
50%,0.577548,0.0,0.0,0.585407,0.517019,0.719143,0.591673,0.757996,1068544.0
75%,0.593131,0.0,0.0,0.682706,0.588549,0.787256,0.713815,0.800187,3053580.0
max,0.931034,0.0,0.931034,0.880435,0.916667,0.984352,0.997932,0.942582,41558120.0


In [None]:
# Separate the regression target (VIEWS) from the predictor columns.
# x keeps every column except the target; y is the target as a Series.
x = dftrain.drop(columns='VIEWS')
y = dftrain.loc[:, 'VIEWS']

In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the split repeatable.
x_train,  x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=1234)

In [None]:
# Confirm the split sizes: features and targets must have matching row counts.
print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

(158, 8) (68, 8) (158,) (68,)


In [None]:
# Candidate regressors, each wrapped in a single-step pipeline and keyed by a short name.
# Every estimator gets the same random_state so repeated runs give the same fits.
pipelines = {
    key: make_pipeline(estimator_cls(random_state=1234))
    for key, estimator_cls in (
        ('rf', RandomForestRegressor),
        ('gb', GradientBoostingRegressor),
        ('ridge', Ridge),
        ('lasso', Lasso),
        ('enet', ElasticNet),
    )
}

In [None]:
# Hyperparameter grids for the grid search, keyed by the same short names as `pipelines`.
# Parameter names use the `<step>__<param>` form, where the step name is the lowercased
# estimator class name that make_pipeline assigns.
hypergrid = {
    'rf': {
        'randomforestregressor__min_samples_split': [2, 4, 6],
        'randomforestregressor__min_samples_leaf': [1, 2, 3]
    },
    'gb': {
        # GradientBoostingRegressor's `alpha` is only used when loss is 'huber' or
        # 'quantile'. With the default loss, searching over it fits the same model for
        # every candidate, so tune parameters that actually change the fit instead.
        'gradientboostingregressor__learning_rate': [0.01, 0.05, 0.1],
        'gradientboostingregressor__n_estimators': [100, 200],
        'gradientboostingregressor__max_depth': [1, 3, 5]
    },
    'ridge': {
        'ridge__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'lasso': {
        'lasso__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    },
    'enet': {
        'elasticnet__alpha': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 0.99]
    }
}

In [None]:
# Tune every pipeline with a 10-fold cross-validated grid search and keep each fitted
# search in fit_models under the pipeline's short name.
fit_models = {}
for algo, pipeline in pipelines.items():
    # n_jobs=-1 lets the search use all available cores.
    model = GridSearchCV(pipeline, hypergrid[algo], cv=10, n_jobs=-1)
    print('Starting training for {}.'.format(algo))
    try:
        model.fit(x_train, y_train)
    except (NotFittedError, ValueError) as e:
        # A misnamed grid parameter or unusable data makes fit() raise ValueError, which
        # the original NotFittedError-only handler let escape and abort the whole loop.
        # Report which algorithm failed and carry on with the rest; the failed one is
        # simply absent from fit_models.
        print('{} failed to fit: {}'.format(algo, repr(e)))
    else:
        fit_models[algo] = model
        print('{} has been successfully fit.'.format(algo))


Starting training for rf.
rf has been successfully fit.
Starting training for gb.
gb has been successfully fit.
Starting training for ridge.
ridge has been successfully fit.
Starting training for lasso.
lasso has been successfully fit.
Starting training for enet.
enet has been successfully fit.


In [None]:
# Score every fitted search on the held-out split: R2 first, then mean absolute error.
for algo, model in fit_models.items():
    yhat = model.predict(x_test)
    print(
        '{} scores - R2:{} MAE:{}'.format(
            algo,
            *(metric(y_test, yhat) for metric in (r2_score, mean_absolute_error))
        )
    )

rf scores - R2:-0.03187040258222984 MAE:3260354.8621074716
gb scores - R2:0.09778305878187799 MAE:3298714.1703288467
ridge scores - R2:-0.04174772661925652 MAE:3187476.559083056
lasso scores - R2:-0.04522098218969095 MAE:3178898.0400933195
enet scores - R2:-0.02106120162627767 MAE:3168075.51445737


In [None]:
# Keep the gradient-boosting search as the chosen model: 'gb' has the highest R2 in the
# evaluation output recorded above. This is a manual pick; revisit it if the scores change.
best_model = fit_models['gb']