### Imports and data loading

In [32]:
# packages for data manipulation
import numpy as np
import pandas as pd
from scipy import stats
# packages for data visualization
import seaborn as sns
import matplotlib.pyplot as plt
# packages for data modeling
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from xgboost import plot_importance

In [20]:
# read the TikTok CSV (relative path, resolved against the working directory) into a DataFrame
data = pd.read_csv(filepath_or_buffer="tiktok_dataset.csv")

### Examine data, summary info, and descriptive stats

In [None]:
# see data
data.head(10)

In [None]:
# see data shape
print(data.shape)

In [None]:
# get data types of variables
print(data.dtypes)

In [None]:
# get basic information about variables
data.info()

In [None]:
# get basic descriptive stats
data.describe()

In [None]:
# check for NA
data.isna().sum()

In [27]:
# fix all NA
data = data.dropna(axis=0)

In [None]:
# visualize
data.head(10)

In [None]:
# check for duplicate entries
data.duplicated().sum()

In [None]:
# check class balance
data['claim_status'].value_counts(normalize=True)

### Feature engineering

In [None]:
# create `transcription_len`: the character count of each video's transcription text
data.loc[:, 'transcription_len'] = data['video_transcription_text'].str.len()
data.head(10)

In [None]:
# check whether the average transcription length differs between 'claim' and 'opinion'
data.groupby('claim_status')[['transcription_len']].mean()

In [None]:
# histogram of `transcription_len`, with claims and opinions drawn as
# side-by-side (dodged) count bars
sns.histplot(
    data=data,
    x='transcription_len',
    hue='claim_status',
    multiple='dodge',
    stat='count',
)
# label the axes that were just drawn on (the current axes)
plt.gca().set(
    xlabel="Transcription Length (# char)",
    ylabel="Count",
    title="Transcription Length for Claims and Opinions",
)
plt.show()

In [None]:
# Encode the categorical variables as numbers for modeling.
# Work on a copy so the cleaned `data` frame is left untouched.
X = data.copy()
# '#' and 'video_id' are removed before modeling
# (presumably row/ID columns with no predictive value -- confirm against the dataset).
X = X.drop(columns=['#', 'video_id'])
# Binary-encode the target: opinion -> 0, claim -> 1.
# `Series.map` yields an integer column directly; dict-based `replace` on an
# object column depends on implicit downcasting, which pandas >= 2.2 deprecates
# (FutureWarning) and which would otherwise leave the target as object dtype.
# `astype(int)` makes any label outside the mapping fail loudly (it becomes NaN,
# which cannot be cast) instead of slipping into training unnoticed.
X['claim_status'] = X['claim_status'].map({'opinion': 0, 'claim': 1}).astype(int)
# One-hot encode the remaining categoricals; drop_first removes the redundant
# first level of each variable.
X = pd.get_dummies(X,
                   columns=['verified_status', 'author_ban_status'],
                   drop_first=True)
X.head(10)

### Clean data

In [57]:
# isolate the encoded label column as the prediction target
y = X.loc[:, 'claim_status']

In [None]:
# remove the target column and the raw transcription text from the feature matrix
X = X.drop(columns=['claim_status', 'video_transcription_text'])
X.head(10)

####  Create train/validate/test sets

In [59]:
# hold out 20% of the rows as the test set; fixed seed for reproducibility
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=7,
    test_size=0.2,
)

In [60]:
# carve a validation set out of the training data
# (25% of the remaining 80% = 20% of all rows)
X_train, X_value, y_train, y_value = train_test_split(
    X_train,
    y_train,
    random_state=7,
    test_size=0.25,
)

In [None]:
# show the shape of every split: train / test / validation features, then labels
tuple(part.shape for part in (X_train, X_test, X_value, y_train, y_test, y_value))

### Build models


In [65]:
# Build the random forest classifier and the grid search that tunes it.
randf = RandomForestClassifier(random_state=7)
# Candidate hyperparameter values; every combination is cross-validated.
hyperparam = {'max_depth': [5, 7, None],
              'n_estimators': [75, 100, 200],
              'max_features': [0.3, 0.6],
              'max_samples': [0.7],
              'min_samples_leaf': [1, 2],
              'min_samples_split': [2, 3]}
# Metrics recorded for every candidate. This must be a list (or tuple/dict):
# GridSearchCV does not list `set` among the accepted `scoring` types, and
# recent scikit-learn versions reject a set during parameter validation.
scores = ['accuracy', 'precision', 'recall', 'f1']
# 5-fold cross-validation; the candidate with the best mean recall is refit
# and exposed through best_estimator_ / best_score_ / best_params_.
grid_obj = GridSearchCV(randf, hyperparam, scoring=scores, cv=5, refit='recall')

In [None]:
# fit random forest model
grid_obj.fit(X_train,y_train)

In [None]:
# seeing recall score
grid_obj.best_score_

In [None]:
# handling results from random forest classifier
grid_results = pd.DataFrame(grid_obj.cv_results_)
best_precision_row = grid_results[grid_results['mean_test_precision'] == grid_results['mean_test_precision'].max()]
print(best_precision_row)

In [None]:
# finding best hyperparameters
grid_obj.best_params_

In [73]:
# Build the XGBoost classifier and the grid search that tunes it.
boost = XGBClassifier(objective='binary:logistic', random_state=7)
# Candidate hyperparameter values; every combination is cross-validated.
boost_param = {'max_depth': [4, 8, 12],
               'min_child_weight': [3, 5],
               'learning_rate': [0.01, 0.1],
               'n_estimators': [300, 500]}
# Metrics recorded for every candidate. This must be a list (or tuple/dict):
# GridSearchCV does not list `set` among the accepted `scoring` types, and
# recent scikit-learn versions reject a set during parameter validation.
boost_scores = ['accuracy', 'precision', 'recall', 'f1']
# 5-fold cross-validation; the candidate with the best mean recall is refit
# and exposed through best_estimator_.
boost_obj = GridSearchCV(boost, boost_param, scoring=boost_scores, cv=5, refit='recall')

In [None]:
# fit boost model
boost_obj.fit(X_train,y_train)

In [None]:
# boost result handling
boost_results = pd.DataFrame(boost_obj.cv_results_)
boost_bpr = boost_results[boost_results['mean_test_precision'] == boost_results['mean_test_precision'].max()]
print(boost_bpr)

### Evaluating models

In [None]:
# get predictions for random forest

y_pred = grid_obj.best_estimator_.predict(X_value)

In [None]:
# display
y_pred

In [None]:
# display actual labels from testing set
y_test

In [None]:
# create a confusion matrix to visualize the results of the classification model
cf_mat = confusion_matrix(y_value,y_pred)
cf_mat_display = ConfusionMatrixDisplay(confusion_matrix=cf_mat,display_labels=None)
cf_mat_display.plot()
plt.show()

In [None]:
# create a classification report
tgt_lab = ['opinion','claim']
print(classification_report(y_value,y_pred,target_names=tgt_lab))

In [None]:
# getting best estimators for boost model
y_pred = boost_obj.best_estimator_.predict(X_test)

In [None]:
# compute values for confusion matrix
boost_cf = confusion_matrix(y_test,y_pred)
boost_disp = ConfusionMatrixDisplay(confusion_matrix=boost_cf,display_labels=None)
boost_disp.plot()
plt.show()

In [None]:
# create a classification report
tgt_lab = ['opinion','claim']
print(classification_report(y_value,y_pred,target_names=tgt_lab))