# YouTube experiment

## Import the libraries

In [8]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# sklearn.cross_validation was removed in scikit-learn 0.20; prefer
# sklearn.model_selection and fall back only on very old installations.
try:
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

## Data preparation

In [3]:
# read the dataset
rd = pd.read_csv('USvideos.csv')

# change booleans to binary: 1 where the flag equals True, 0 for anything else.
# The comparison is vectorised instead of looping over rows in Python, and it
# does not rely on the frame having a default 0..n-1 index the way the
# label-based rd[col][i] lookup did.
flag_columns = ['comments_disabled', 'ratings_disabled', 'video_error_or_removed']
rd = rd.assign(**{col: rd[col].eq(True).astype(int) for col in flag_columns})

# target: the `likes` column
y = rd['likes']

# features: whatever remains once the target and the columns that are not
# used as predictors have been dropped
excluded_columns = [
    'ratings_disabled', 'comments_disabled', 'video_error_or_removed',
    'comment_count', 'thumbnail_link', 'description', 'publish_time',
    'tags', 'category_id', 'channel_title', 'video_id', 'trending_date',
    'likes', 'title',
]
x = rd.drop(excluded_columns, axis=1)

# hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=1
)

## Regression Model

In [4]:
# random forest regression
# 20 trees; a node must hold at least 4 samples before it may be split.
# random_state is pinned so that refitting on a fresh kernel reproduces the
# same forest, and therefore the same metrics and feature importances.
rfc = RandomForestRegressor(n_estimators=20, min_samples_split=4, random_state=1)
rfc.fit(x_train, y_train)

# predictions for the held-out rows, used by the evaluation cell
pred = rfc.predict(x_test)

## Evaluation and Visualization

In [9]:
# r-squared and mse on the held-out test set
r2 = r2_score(y_test, pred)
mse = mean_squared_error(y_test, pred)

print("mse: %0.3f" % mse)
print("r-squared: %0.3f" % r2)
print("feature-importances:", rfc.feature_importances_)

# put the feature importances in a frame labelled with the feature names,
# so the chart shows which column each value belongs to
fi = pd.DataFrame(rfc.feature_importances_, index=x_train.columns, columns=['importance'])
ax = fi.plot(kind='bar', legend=False)
ax.set_xlabel('feature')
ax.set_ylabel('importance')
ax.set_title('Random forest feature importances')
plt.show()

# visualize the correlation
# Only numeric/bool columns can be correlated. Selecting them explicitly keeps
# this working on pandas versions where DataFrame.corr() no longer drops
# string columns silently and raises instead.
corr = rd.select_dtypes(include=['number', 'bool']).corr()
ax = sns.heatmap(corr, xticklabels=corr.columns.values, yticklabels=corr.columns.values)
ax.set_title('Correlation between numeric columns')
# show the figure explicitly so the cell does not echo the Axes repr
plt.show()

mse: 4676365368.570
r-squared: 0.780
feature-importances: [0.78801486 0.21198514]


<matplotlib.axes._subplots.AxesSubplot at 0x112552fd0>