# YouTube Experiment - Regression Model

### Import the libraries

In [None]:
# This Source Code Form is subject to the terms of the MIT
# License. If a copy of the same was not distributed with this
# file, You can obtain one at
# https://github.com/akhilpandey95/reproducibility/blob/master/LICENSE.

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

try:
    # scikit-learn >= 0.18: train_test_split lives in model_selection
    from sklearn.model_selection import train_test_split
except ImportError:
    # older scikit-learn: cross_validation (removed in 0.20)
    from sklearn.cross_validation import train_test_split

%matplotlib inline

### Read the dataset

In [None]:
# load the CSV file named below into the working DataFrame `rd`
rd = pd.read_csv(filepath_or_buffer='USvideos-with-transcripts.csv')

### Drop the rows with NaN

In [None]:
# Drop every row whose view count is missing.
# NOTE: a mask like `rd.views != np.nan` cannot do this -- NaN compares
# unequal to everything (itself included), so that mask is all True and
# keeps every row. dropna() performs a real null check instead.
rd = rd.dropna(subset=['views'])

# renumber the rows 0..n-1 so the index has no gaps after the drop
rd = rd.reset_index(drop=True)

# print few rows
rd.head()

### Train/test split

In [None]:
# train test
# Rows with a known view count. The mask is computed once on `rd` and applied
# to BOTH the features and the target so the two stay row-aligned.
# (A comparison such as `x.views != np.nan` is always True -- NaN is unequal
# to everything -- so it never filters anything.)
has_views = rd.views.notnull()

# target: the `likes` column
y = rd.loc[has_views, 'likes'].reset_index(drop=True)

# features: every column except the target and the columns listed here
x = rd.loc[has_views].drop(['video_transcript_en' , 'ratings_disabled', 'comments_disabled', 'video_error_or_removed', 'thumbnail_link', 'description', 'publish_time', 'tags', 'category_id', 'channel_title', 'video_id', 'trending_date', 'likes', 'title', 'Unnamed: 0'], axis=1)

# reset the index
x = x.reset_index(drop=True)

# split: hold out 20% of the rows for testing; the fixed random_state makes
# the partition identical on every run
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)

### Visualize the dataset

In [None]:
# scatter plot of likes against views with a fitted regression line
sns.lmplot(data=rd, x='views', y='likes')

### Build the regression model

In [None]:
# preview the first few rows of the feature frame rather than displaying all of it
x.head()

In [None]:
# random forest regression
# random_state pins the forest's internal randomness so that re-running the
# notebook reproduces the same model, predictions and metrics; without it
# every run yields slightly different numbers even though the train/test
# split itself is seeded.
rfc = RandomForestRegressor(n_estimators=20, min_samples_split=4, random_state=1)

# train on the training partition, then predict on the held-out rows
rfc.fit(x_train, y_train)
pred = rfc.predict(x_test)

### Metrics for evaluating the model

In [None]:
# r-squared and mse of the predictions on the held-out test rows
r2 = r2_score(y_test, pred)
mse = mean_squared_error(y_test, pred)

print("mse: %0.3f" % mse)
print("r-squared: %0.3f" % r2)
print("feature-importances:", rfc.feature_importances_)

# convert the feature_importance to a matrix
# index the importances by feature name so each value on the plot can be
# attributed to its column; bars are used because the features are
# categories, not points along a continuous axis
fi = pd.DataFrame(rfc.feature_importances_, index=x_train.columns)
fi.plot(kind='bar', legend=False)
plt.show()

# visualize the correlation
# restrict to numeric/boolean columns before correlating: recent pandas
# raises on non-numeric columns instead of silently skipping them, and `rd`
# still carries the text columns that were only dropped from `x`
corr = rd.select_dtypes(include=[np.number, 'bool']).corr()
sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)