# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

from sklearn.model_selection import KFold
from sklearn.utils import resample
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

In [2]:
from sklearn import metrics
import warnings
# NOTE(review): this installs a blanket 'ignore' filter, so every warning raised
# after this cell (deprecation notices included) is hidden for the rest of the run.
warnings.filterwarnings('ignore')

In [3]:
# Keywords looked up in the 'description' column; each keyword becomes its own
# 0/1 indicator column (see new_column below).
interests = ['python', 'machine learning', 'deep learning', 'engineer','data science','artificial intelligence', 'nlp','cs','computers','follow','followback']


def new_column(string, frame=None):
    """Add a 0/1 column named ``string`` marking descriptions that contain it.

    The match is a case-insensitive, literal substring search on the
    'description' column. Rows whose description is missing get 0.

    Parameters
    ----------
    string : str
        Keyword to look for; also used as the name of the new column.
    frame : pandas.DataFrame, optional
        Frame to modify in place. Defaults to the notebook-level ``df`` so
        existing ``new_column(keyword)`` calls behave as before.

    Returns
    -------
    None
        The column is written directly into the frame.
    """
    target = df if frame is None else frame
    descriptions = target['description'].str.lower()
    # regex=False: treat the keyword literally, so characters such as '+' or '('
    # cannot raise or change the meaning of the search.
    # na=False: a missing description counts as "keyword not present".
    found = descriptions.str.contains(string.lower(), regex=False, na=False)
    target[string] = np.where(found, 1, 0)

In [4]:
# Load the dataset into the notebook-level frame `df`; the path is relative to
# the working directory the notebook is run from.
df = pd.read_csv("Twitter_MOST_New.csv")

In [5]:
# Remove the 'Unnamed: 0' column (presumably a row index written out with the CSV - TODO confirm)
df.drop(columns=['Unnamed: 0'], inplace=True)

In [6]:
# Create one indicator column per keyword listed in `interests`
for keyword in interests:
    new_column(keyword)

# Dropping irrelevant attributes

In [7]:
df.drop(columns=['utc_offset'], inplace=True)
# A missing url_count means the user has no URLs, so it is replaced with 0.
# The column is re-assigned instead of calling `df.url_count.fillna(0, inplace=True)`:
# that chained in-place call works on a temporary Series and leaves `df`
# unchanged when pandas Copy-on-Write is active.
df['url_count'] = df['url_count'].fillna(0)

In [8]:
# The keyword indicator columns were already derived from 'description',
# so the raw text column is no longer required.
df.drop(columns=['description'], inplace=True)

In [9]:
# These columns have either a large number of unique values or a single value
# for all the observations.
df.drop(columns=['username', 'user_id', 'protected'], inplace=True)

In [10]:
# Cast each flag column to str so its values can be matched as the text
# 'True' / 'False' in the next step.
for flag in ('geo_enabled', 'veified', 'contributors_enabled', 'translation_enabled'):
    df[flag] = df[flag].astype(str)

In [11]:
# Mapping 'True' with 1 and 'False' with 0.
# Each column is re-assigned instead of calling `df.col.replace(..., inplace=True)`:
# that chained in-place call works on a temporary Series and leaves `df`
# unchanged when pandas Copy-on-Write is active.
# infer_objects() gives a column that now holds only integers an integer dtype,
# rather than relying on replace() to downcast implicitly; any other value is
# left in place untouched, as before.
flag_mapping = {'True': 1, 'False': 0}
for flag_column in ['geo_enabled', 'veified', 'contributors_enabled', 'translation_enabled']:
    df[flag_column] = df[flag_column].replace(flag_mapping).infer_objects()

# Train-Test Split

In [12]:
# Target: the 'user_follows_me' column; features: every other column of df
y = df['user_follows_me']
X = df.drop(columns=['user_follows_me'])

In [13]:
# Hold out 30% of the rows for testing; fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    shuffle=True,
    random_state=42,
)

# Upsampling

In [14]:
# Put the target back next to the training features so rows are resampled
# together with their label. Note: this rebinds `X` to the training rows only.
X = pd.concat([X_train, y_train], axis=1)

# split the training rows by class
following = X.loc[X['user_follows_me'] == 1]
not_following = X.loc[X['user_follows_me'] == 0]

# draw from the `following` rows with replacement until there are as many
# of them as `not_following` rows
following_upsampled = resample(
    following,
    replace=True,                      # sample with replacement
    n_samples=not_following.shape[0],  # match the size of the other class
    random_state=42,                   # reproducible results
)

In [15]:
# stack the untouched `not_following` rows on top of the resampled `following` rows
upsampled = pd.concat([not_following, following_upsampled], axis=0)

# to inspect the new class counts, run:
##upsampled.user_follows_me.value_counts()
   

In [16]:
# Rebuild the training features / target from the upsampled rows.
X_train = upsampled.drop(columns=['user_follows_me'])
# Select the target as a 1-D Series (same form as y_test) rather than a
# one-column DataFrame, so the estimator receives labels of shape (n_samples,).
y_train = upsampled['user_follows_me']

# XGBoost Classification

In [17]:
# Gradient-boosted classifier with the library's default hyper-parameters,
# trained on the upsampled training set.
XGB = XGBClassifier()
XGB.fit(X=X_train, y=y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [18]:
# Predict labels for the held-out test features with the fitted model
predictions_XGB = XGB.predict(X_test)

In [19]:
# Per-class precision / recall / f1 of the predictions on the test set
print(
    metrics.classification_report(
        y_true=y_test,
        y_pred=predictions_XGB,
    )
)

              precision    recall  f1-score   support

           0       0.89      0.71      0.79       504
           1       0.21      0.47      0.29        81

    accuracy                           0.68       585
   macro avg       0.55      0.59      0.54       585
weighted avg       0.80      0.68      0.72       585



In [20]:
# import pickle
# pickle.dump(XGB, open('final_prediction.pkl', 'wb'))