>**Problem Statement:** An online question-and-answer platform has hired you as a data scientist to identify the best question authors on the platform. Identifying them will provide insight into how to increase user engagement. Given a question's tag, the number of views and answers it received, and the username and reputation of its author, the task is to predict the number of upvotes the question will receive.

# With log transformation

In [49]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import skew
from scipy.stats import kurtosis

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
from xgboost import XGBRegressor
from sklearn.preprocessing import LabelEncoder

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, LeakyReLU, PReLU, ELU, Dropout
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers
from sklearn import linear_model

from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler

In [50]:
# Both CSVs are read from the notebook's working directory.
TRAIN_CSV = "train_NIR5Yl1.csv"
TEST_CSV = "test_8i3B3FC.csv"

df_train = pd.read_csv(TRAIN_CSV)
df_test = pd.read_csv(TEST_CSV)

In [51]:
# df_train.Upvotes.plot(kind='kde')

In [52]:
# independent variables
# df_train.Reputation.plot(kind='kde')

In [53]:
# List the columns available in the training frame.
df_train.columns

# NOTE(review): the remark below reads like a box-plot observation, but no
# box plot is visible in this notebook — confirm which plot it refers to.
# It seems there are too many values outside the box (i.e. many outliers).

Index(['ID', 'Tag', 'Reputation', 'Answers', 'Username', 'Views', 'Upvotes'], dtype='object')

### Applying log transformation on skewed data

In [54]:
mm = MinMaxScaler()

In [55]:
df_train.Reputation = mm.fit_transform(df_train[['Reputation']])
df_test.Reputation = mm.transform(df_test[['Reputation']])

In [56]:
df_train.Views = mm.fit_transform(df_train[['Views']])
df_test.Views = mm.transform(df_test[['Views']])

In [57]:
df_train.Answers = mm.fit_transform(df_train[['Answers']])
df_test.Answers = mm.transform(df_test[['Answers']])

In [58]:
df_train.Reputation = np.log1p(df_train.Reputation)
df_test.Reputation = np.log1p(df_test.Reputation)

In [59]:
df_train.Answers = np.log1p(df_train.Answers)
df_test.Answers = np.log1p(df_test.Answers)

In [60]:
df_train.Views = np.log1p(df_train.Views)
df_test.Views = np.log1p(df_test.Views)

In [61]:
# Model the target on the log1p scale; np.expm1 maps predictions back to
# raw upvote counts.
df_train['Upvotes'] = np.log1p(df_train['Upvotes'])

In [62]:
# Re-list the columns after the transformations above.
df_train.columns

Index(['ID', 'Tag', 'Reputation', 'Answers', 'Username', 'Views', 'Upvotes'], dtype='object')

### After transformation

In [63]:
# df_train.Reputation.plot(kind='kde')

In [64]:
# 'Username' and 'ID' are removed from both frames before modelling.
# The drop is done in place, so re-running this cell raises a KeyError.
identifier_cols = ['Username', 'ID']
for frame in (df_train, df_test):
    frame.drop(columns=identifier_cols, inplace=True)



In [65]:
# Encode the categorical 'Tag' column as integer codes.
# (An earlier variant one-hot encoded it with
#  pd.get_dummies(..., columns=['Tag'], drop_first=True) instead.)
# The encoder learns its classes from the training tags only, so a tag that
# appears only in the test file makes `transform` raise a ValueError.
le = LabelEncoder().fit(df_train['Tag'])
df_train['Tag'] = le.transform(df_train['Tag'])
df_test['Tag'] = le.transform(df_test['Tag'])

In [66]:
# Separate the (log-transformed) target from the feature matrix.
target_col = 'Upvotes'
y = df_train[target_col]
X = df_train.drop(columns=[target_col])

In [67]:
# Hold out 20% of the labelled training data for evaluation. Note that
# X_test / y_test come from df_train, not from the unlabelled df_test file.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [68]:
# Sanity check: (rows, feature columns) of the training split.
X_train.shape

(264036, 4)

In [69]:
# Linear baseline: Lasso with a very small L1 penalty (alpha = 1e-7).
clf = linear_model.Lasso(alpha=1e-7)

In [70]:
# Fit the linear baseline on the training split (target is log1p(Upvotes)).
clf.fit(X_train,y_train)

Lasso(alpha=1e-07)

In [71]:
# Predict on the hold-out split. The model was trained on log1p(Upvotes), so
# these predictions are on the log1p scale; np.expm1 would map them back to
# raw upvote counts.
y_pred = clf.predict(X_test)

In [72]:
# R^2 of the Lasso baseline on the hold-out split. Both y_test and y_pred are
# on the log1p(Upvotes) scale, so this is not R^2 on raw upvote counts.
r2_score(y_test,y_pred)

0.43651308725936644

In [73]:
# Peek at the Lasso predictions (log1p scale).
y_pred

array([2.93303811, 4.25761458, 3.2903662 , ..., 3.57784642, 3.57552065,
       4.38832904])

In [74]:
# Mean squared error of the Lasso baseline, measured on the log1p(Upvotes)
# scale (i.e. a squared log error, not an error in raw upvotes).
mean_squared_error(y_test,y_pred)

1.9125326684573676

In [75]:
# Display the hold-out feature matrix. This is the 20% split carved out of
# df_train by train_test_split, not the separate df_test file.
X_test

Unnamed: 0,Tag,Reputation,Answers,Views
101695,6,0.000017,0.038715,0.001061
260437,9,0.000472,0.135175,0.007712
207112,4,0.000322,0.038715,0.007906
319275,1,0.000160,0.063716,0.027091
157589,0,0.000338,0.051293,0.000386
...,...,...,...,...
72320,1,0.000170,0.013072,0.000127
16363,6,0.000445,0.038715,0.000299
43714,1,0.000728,0.075986,0.001329
306294,1,0.000719,0.051293,0.008433


In [82]:
# Gradient-boosted tree regressor. Every keyword is spelled out explicitly,
# grouped by purpose, so the configuration is fully pinned in the notebook.
xgb_params = {
    # boosting schedule and tree shape
    'booster': 'gbtree',
    'tree_method': 'exact',
    'n_estimators': 600,
    'learning_rate': 0.1,
    'max_depth': 2,
    'min_child_weight': 3,
    'max_delta_step': 0,
    'num_parallel_tree': 1,
    'base_score': 0.6,
    # regularisation
    'gamma': 0,
    'reg_alpha': 0,
    'reg_lambda': 1,
    # row / column subsampling
    'subsample': 1,
    'colsample_bytree': 1,
    'colsample_bylevel': 1,
    'colsample_bynode': 1,
    # constraints and miscellaneous model options
    'interaction_constraints': '',
    'monotone_constraints': '()',
    'scale_pos_weight': 1,
    'importance_type': 'gain',
    'missing': np.nan,
    # runtime
    'gpu_id': -1,
    'n_jobs': 8,
    'random_state': 0,
    'validate_parameters': 1,
    'verbosity': None,
}
xgb = XGBRegressor(**xgb_params)

In [83]:
# Fit the boosted-tree model on the same training split as the Lasso baseline
# (target is log1p(Upvotes)).
xgb.fit(X_train,y_train)

XGBRegressor(base_score=0.6, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.1, max_delta_step=0, max_depth=2,
             min_child_weight=3, missing=nan, monotone_constraints='()',
             n_estimators=600, n_jobs=8, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [84]:
# Predict on the hold-out split; predictions are on the log1p(Upvotes) scale
# (np.expm1 would map them back to raw upvote counts).
pred = xgb.predict(X_test)

In [85]:
# R^2 of the XGBoost model on the hold-out split, on the log1p(Upvotes) scale;
# comparable with the Lasso R^2 computed earlier on the same split.
r2_score(y_test,pred)

0.8458396975758022

In [86]:
# Mean squared error of the XGBoost model, measured on the log1p(Upvotes)
# scale (not in raw upvotes).
mean_squared_error(y_test,pred)

0.5232359579241119