Name : Charles Patel <br>
Email: charlespatel007@yahoo.com

In [None]:
! pip install -r requirements.txt

In [None]:
import random
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt 

from typing import Dict, List
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.linear_model import LinearRegression, Lasso, RidgeCV
from sklearn.model_selection import train_test_split, GridSearchCV

### Data

In [None]:
# Load the three input tables in one pass:
#   articles       - shared-article records
#   interactions   - user interaction events
#   articles_info  - features extracted from article text
articles, interactions, articles_info = (
    pd.read_csv(csv_path)
    for csv_path in (
        'data/shared_articles.csv',
        'data/users_interactions.csv',
        'data/article_info.csv',
    )
)

#### Generated features from each article text

In [None]:
# Display the per-article text features loaded from data/article_info.csv.
articles_info

In [None]:
# Attach the text-derived features to every article row (left join keeps
# articles that have no extracted features), then remove exact duplicate rows.
article_data = (
    articles
    .merge(articles_info, on='contentId', how='left')
    .drop_duplicates()
)

In [None]:
# Preview the first rows of the merged article table.
article_data.head()

In [None]:
# Preview the first rows of the raw user-interaction events.
interactions.head()

In [None]:
# Print the column dtypes and non-null counts of the interaction events.
interactions.info()

### Extracting features from user interaction

In [None]:
# Give every interaction row a unit weight so occurrences can be summed.
interactions['activity_count'] = 1

# Number of interactions per (article, event type) pair.
group_data = (
    interactions
    .groupby(['contentId', 'eventType'])['activity_count']
    .sum()
    .reset_index()
)

# Wide layout: one row per article, one column per event type; pairs that
# never occurred are filled with 0.
activity_data = group_data.pivot_table(
    values='activity_count',
    index=['contentId'],
    columns='eventType',
).fillna(0)

In [None]:
# List the columns available on the merged article table.
article_data.columns

In [None]:
# Article columns carried into the modelling table: identifiers, raw text
# fields, categorical descriptors and the text-derived numeric features.
articles_features = [
    'contentId', 'eventType', 'contentType', 'title', 'text', 'lang',
    'tokens', 'unique_tokens', 'average_token_length',
    'n_non_stop_unique_tokens', 'global_subjectivity',
    'avg_positive_polarity', 'global_sentiment_polarity',
]

# Join the per-article interaction counts with the selected article columns.
# The default (inner) join keeps only articles present on both sides.
selected_articles = article_data[articles_features]
data = activity_data.merge(selected_articles, on='contentId')

#### One-hot encoding of some features

In [None]:
# Signed indicator columns for the article-level event type: -5 when the
# article was removed, +5 when it was shared, 0 otherwise. These values are
# later added directly to the VIRALITY label.
#
# They are built from explicit comparisons instead of
# ``pd.get_dummies(...).replace({...: {1: -5}})`` for two reasons:
#   * since pandas 2.0 ``get_dummies`` returns bool columns by default, and
#     replacing the integer 1 may not match ``True``, which would silently
#     drop the -5 / +5 weighting;
#   * the positional rename to two column names required both categories to
#     be present, in alphabetical order.
event_type_onehot = pd.DataFrame(
    {
        'CONTENT_REMOVED': (data.eventType == 'CONTENT REMOVED').astype(int) * -5,
        'CONTENT_SHARED': (data.eventType == 'CONTENT SHARED').astype(int) * 5,
    },
    index=data.index,
)

# One indicator column per content type, named CONTENT_TYPE_<value>.
content_type_onehot = pd.get_dummies(data.contentType, prefix='CONTENT_TYPE')

# One indicator column per language code. Column names are derived from the
# codes actually present (e.g. 'en' -> 'LANGUAGE_EN') rather than from a
# hard-coded list of five names, which raised a length mismatch as soon as the
# data contained a different set of languages.
lang_onehot = pd.get_dummies(data.lang)
lang_onehot.columns = ['LANGUAGE_' + str(code).upper() for code in lang_onehot.columns]

In [None]:
# Swap the raw categorical columns for their encoded counterparts.
encoded_blocks = [event_type_onehot, content_type_onehot, lang_onehot]
data = pd.concat(
    [data.drop(columns=['eventType', 'contentType', 'lang'])] + encoded_blocks,
    axis=1,
)

In [None]:
# List the columns of the modelling table after encoding.
data.columns

#### Creating labels (Virality)

In [None]:
# Weight applied to each interaction count when scoring an article.
VIRALITY = {
    'VIEW': 1.0,
    'LIKE': 4.0,
    'COMMENT CREATED': 10.0,
    'FOLLOW': 25.0,
    'BOOKMARK': 100.0,
}


def create_virality_label(row):
    """Return the virality score of one article row.

    The score is the weighted sum of the interaction counts listed in
    ``VIRALITY`` plus the row's ``CONTENT_REMOVED`` and ``CONTENT_SHARED``
    values.

    ``row`` may be any mapping-like object indexable by column name
    (for example a DataFrame row).
    """
    score = 0
    for event_name in VIRALITY:
        score = score + VIRALITY[event_name] * row[event_name]

    return score + row['CONTENT_REMOVED'] + row['CONTENT_SHARED']

# Score every article: each row is passed to create_virality_label.
data['VIRALITY'] = data.apply(create_virality_label, axis=1)

In [None]:
# Remove the identifier and raw text columns from the modelling table.
data = data.drop(columns=['contentId', 'title', 'text'])

In [None]:
# Preview the first rows of the final modelling table.
data.head()

### Data splitting

In [None]:
features = list(data.columns)

# The target is selected by name instead of "last column", so the split keeps
# working if columns are added or reordered.
target_column = 'VIRALITY'

# The interaction-count columns are the inputs of the VIRALITY formula, so the
# label is an exact linear function of them. Keeping them in X is target
# leakage: a linear model can reproduce the label almost perfectly without
# learning anything from the article itself. They are therefore excluded.
# CONTENT_REMOVED / CONTENT_SHARED are kept because they describe the article
# rather than the users' response to it.
leaky_columns = list(VIRALITY)

X = data.drop(columns=[target_column] + leaky_columns)
y = data[target_column]

# 80/20 shuffled split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=True, random_state=42)

#### Correlation between features

In [None]:
# Pairwise correlations between all columns of the modelling table,
# drawn as an annotated heatmap (two-decimal labels).
correlations = data.corr()
plt.figure(figsize=(16, 12))
sns.heatmap(correlations, annot=True, fmt=".2f")

In [None]:
# Show the (rows, columns) of the training and test feature sets.
X_train.shape, X_test.shape

In [None]:
# Show the sizes of the training and test targets.
y_train.shape, y_test.shape

In [None]:
# Metric stores keyed by regressor name; filled by calculate_scores().
r2_scores, mae_scores, rmse_scores = dict(), dict(), dict()

def calculate_scores(regressor_name: str, predicted: List, actual: List):
    """Calculate and store the R2, MAE and RMSE of a set of predictions.

    Args:
        regressor_name: Key under which the scores are stored.
        predicted: Values predicted by the regressor.
        actual: Ground-truth values, aligned with ``predicted``.

    The results are written to the module-level ``r2_scores``,
    ``mae_scores`` and ``rmse_scores`` dictionaries; nothing is returned.
    An existing entry for ``regressor_name`` is overwritten.
    """

    r2_scores[regressor_name] = r2_score(actual, predicted)
    mae_scores[regressor_name] = mean_absolute_error(actual, predicted)
    # RMSE is taken as the square root of the MSE instead of passing
    # ``squared=False``: that argument was deprecated in scikit-learn 1.4 and
    # removed in 1.6, whereas this form works on every version.
    rmse_scores[regressor_name] = np.sqrt(mean_squared_error(actual, predicted))

### Model Building

#### Linear Regression

In [None]:
# Ordinary least-squares baseline: fit on the training split, score on the
# held-out split.
linear_regression = LinearRegression().fit(X_train, y_train)
prediction_lr = linear_regression.predict(X_test)

calculate_scores(regressor_name='linear_regression',
                 predicted=prediction_lr,
                 actual=y_test)

#### Lasso regression

In [None]:
# L1-regularised linear model with a fixed penalty strength of 0.1.
lasso = Lasso(alpha=0.1).fit(X_train, y_train)
prediction_lasso = lasso.predict(X_test)

calculate_scores(regressor_name='lasso',
                 predicted=prediction_lasso,
                 actual=y_test)

#### Ridge regression

In [None]:
# Ridge regression whose penalty strength is chosen among the listed alphas
# by RidgeCV's built-in cross-validation (cv=None), scored by negative RMSE.
# ``store_cv_values=True`` is no longer passed: the stored values were never
# read, and the parameter was deprecated in scikit-learn 1.5 (renamed
# ``store_cv_results``) and removed in 1.7, so passing it fails on current
# releases.
ridge_regression = RidgeCV(alphas=[0.001, 0.1, 1, 5, 10, 100],
                           scoring='neg_root_mean_squared_error',
                           cv=None)
ridge_regression.fit(X_train, y_train)
prediction_rr = ridge_regression.predict(X_test)

calculate_scores('ridge_regression', prediction_rr, y_test)

#### XGB regression

In [None]:
# 3-fold grid search over the tree depth of an XGBoost regressor,
# selecting the depth with the best (negative) RMSE.
params_xgb = {'max_depth': [5, 20, 50]}
xgb_regression = XGBRegressor(random_state=42)

gsc_xgb = GridSearchCV(
    estimator=xgb_regression,
    param_grid=params_xgb,
    scoring='neg_root_mean_squared_error',
    cv=3,
)
gsc_xgb_res = gsc_xgb.fit(X_train, y_train)

In [None]:
# Refit the regressor once per booster type, using the depth selected by the
# grid search, and record the scores under 'xgb_<booster>'.
# NOTE(review): max_depth is a tree parameter; presumably it has no effect
# for the 'gblinear' booster - confirm against the XGBoost documentation.
boosters = ['gbtree', 'gblinear']
xgb_regression.max_depth = gsc_xgb_res.best_params_['max_depth']

for booster_name in boosters:
    xgb_regression.booster = booster_name
    xgb_regression.fit(X_train, y_train)

    prediction_xgb = xgb_regression.predict(X_test)
    calculate_scores('xgb_{}'.format(booster_name), prediction_xgb, y_test)

#### CatBoost Regression

In [None]:
# CatBoost regressor (silent training, fixed seed, RMSE as evaluation metric).
cat_boost = CatBoostRegressor(
    verbose=0,
    random_state=42,
    eval_metric='RMSE',
)
cat_boost.fit(X_train, y_train)

prediction_cb = cat_boost.predict(X_test)
calculate_scores(regressor_name='cat_boost',
                 predicted=prediction_cb,
                 actual=y_test)

### Model Evaluation

In [None]:
def plot_bar_graph(name: str, data: Dict) -> None:
    """Draw one bar per regressor showing its score for a metric.

    Args:
        name: Metric name, used as the y-axis label.
        data: Mapping of regressor name to metric score.
    """
    regressor_names = list(data.keys())
    metric_values = list(data.values())

    plt.figure(figsize=(13, 8))
    axes = sns.barplot(x=regressor_names, y=metric_values, palette="Blues_d")
    axes.set(xlabel='Regressor Name', ylabel=name)
    plt.show()

#### RMSE (Root Mean Squared Error)

In [None]:
# Compare the regressors by root mean squared error (lower is better).
plot_bar_graph('RMSE', rmse_scores)

#### MAE (Mean Absolute Error)

In [None]:
# Compare the regressors by mean absolute error (lower is better).
plot_bar_graph('MAE', mae_scores)

#### R2 (R - Squared)

In [None]:
# Compare the regressors by R-squared (higher is better).
plot_bar_graph('R2', r2_scores)

### Results:
```Linear regression``` achieves the lowest ```RMSE``` and ```MAE``` and therefore performs best among the models compared, followed by Ridge and Lasso regression.<br>
Thus, a simple model can perform as well as, or better than, more sophisticated models.