# INSY 670 Group Project 1

## 1. Predictive Model

## 1.1 Environment Setup

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

## 1.2 Data Pre-processing

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5500 entries, 0 to 5499
Data columns (total 23 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Choice               5500 non-null   int64  
 1   A_follower_count     5500 non-null   int64  
 2   A_following_count    5500 non-null   int64  
 3   A_listed_count       5500 non-null   int64  
 4   A_mentions_received  5500 non-null   float64
 5   A_retweets_received  5500 non-null   float64
 6   A_mentions_sent      5500 non-null   float64
 7   A_retweets_sent      5500 non-null   float64
 8   A_posts              5500 non-null   float64
 9   A_network_feature_1  5500 non-null   int64  
 10  A_network_feature_2  5500 non-null   float64
 11  A_network_feature_3  5500 non-null   float64
 12  B_follower_count     5500 non-null   int64  
 13  B_following_count    5500 non-null   int64  
 14  B_listed_count       5500 non-null   int64  
 15  B_mentions_received  5500 non-null   f

In [4]:
df = pd.DataFrame(columns = ["A_B_follower_count","A_B_following_count","A_B_listed_count","A_B_mentions_received",
                "A_B_retweets_received","A_B_mentions_sent","A_B_retweets_sent","A_B_posts","A_B_network_feature_1",
                "A_B_network_feature_2","A_B_network_feature_3"])

In [5]:
for i in range(11):
    series = train.iloc[:,i+1]-train.iloc[:,i+1+11]
    df.iloc[:,i] = series
df['Choice'] = train['Choice']

In [6]:
df

Unnamed: 0,A_B_follower_count,A_B_following_count,A_B_listed_count,A_B_mentions_received,A_B_retweets_received,A_B_mentions_sent,A_B_retweets_sent,A_B_posts,A_B_network_feature_1,A_B_network_feature_2,A_B_network_feature_3,Choice
0,-34235,-29506,-1686,-14.846518,-3.883525,-8.103828,-0.231920,-6.626665,-64,90.969697,9438.106061,0
1,-17671,331,-1382,49.961485,16.854685,2.481652,0.546816,4.106299,206,-113.587704,-1601.149290,0
2,3688,733,-105,24.768949,9.201969,4.758317,0.490702,2.986516,92,58.594502,5722.563574,0
3,-19542,-17630,-276,-565.184032,-390.016375,-26.220532,-7.067053,-29.271279,-1756,-21.469296,-1299.678967,0
4,38035,-849,2460,127.252413,33.417223,21.117111,2.213765,19.298035,466,78.904293,840.220036,1
...,...,...,...,...,...,...,...,...,...,...,...,...
5495,-762629,-1185,-13787,-5351.839938,-4730.783795,54.126383,3.006633,86.017644,-20651,28.807054,2171.964468,0
5496,-682,-236,-3,1.139100,-0.205570,0.000000,0.000000,-1.018804,4,47.600000,-3554.800000,1
5497,7831,-1038,566,34.391680,18.740284,0.181576,-0.777790,1.940731,117,103.995098,1292.989740,0
5498,-57424,-37392,-21681,455.382230,415.318328,-1.388242,-1.244570,3.138261,1907,-230.511754,-6661.772353,0


## 1.3 Features Selection

In [7]:
X = df.iloc[:,:10]
y = df['Choice']

In [8]:
scaler = StandardScaler()
X_std = scaler.fit_transform(X)

In [9]:
randomforest = RandomForestClassifier(random_state=42)
model = randomforest.fit(X_std,y)
importance = pd.DataFrame(list(zip(X.columns,model.feature_importances_)),columns=
             ['predictor','feature importance'])
print(importance.sort_values(by=['feature importance'],ascending = False).head(5))

               predictor  feature importance
2       A_B_listed_count            0.172667
0     A_B_follower_count            0.149952
8  A_B_network_feature_1            0.126989
4  A_B_retweets_received            0.105559
3  A_B_mentions_received            0.097317


In [10]:
df2 = df[['A_B_listed_count','A_B_follower_count','A_B_network_feature_1','A_B_retweets_received','A_B_mentions_received']]

## 1.4 Binary Classification

### 1.4.1 Train Test Split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X_std,y,test_size=0.33,random_state=5)

### 1.4.2 Logistic Regression

In [12]:
# Run the Model
lr = LogisticRegression()
model_lr = lr.fit(X_std,y)

# Calculate the accuracy score
y_test_pred = model_lr.predict(X_test)
print('Logistic Regression Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Logistic Regression Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Logistic Regression Accuracy Score: 0.7371900826446282
Logistic Regression Confusion Matrix:
        pred:0  pred:1
true:0     631     255
true:1     222     707


### 1.4.3 Random Forest

In [13]:
# Run the Model
randomforest = RandomForestClassifier(random_state=42)
model_rf = randomforest.fit(X_train,y_train)

# Calculate the accuracy score
y_test_pred = model_rf.predict(X_test)
print('Random Forest Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Random Forest Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Random Forest Accuracy Score: 0.7608815426997245
Random Forest Confusion Matrix:
        pred:0  pred:1
true:0     663     223
true:1     211     718


### 1.4.4 Gradient Boosting

In [14]:
# Run the Model
gbt = GradientBoostingClassifier(random_state=42)
model_gbt = gbt.fit(X_train,y_train)

# Calculate the accuracy score
y_test_pred = model_gbt.predict(X_test)
print('Gradient Boosting Accuracy Score:',accuracy_score(y_test,y_test_pred))

# Confusion Matrix
print('Gradient Boosting Confusion Matrix:')
print(pd.DataFrame(confusion_matrix(y_test,y_test_pred,labels=[0,1]),index=['true:0','true:1'],columns=['pred:0','pred:1']))

Gradient Boosting Accuracy Score: 0.7768595041322314
Gradient Boosting Confusion Matrix:
        pred:0  pred:1
true:0     679     207
true:1     198     731


According to accuracy score, we see that gradient boosting has the best performance.

## 1.5 Results Analysis

**1. Which factors are best predictors of influence?**<br>
According to the result of Random Forest's feature importance, we selected the top 5 factors: difference between A and B in 'listed_count','follower_count','network_feature_1','retweets_received' and 'mentions_received'.<br><br>

**2. Are there any surprises here?** <br>
We are not surprised by the result, because the more influential a person is on social media, the more interactive they become (mentions_received, retweets_received), the more popularity they gain (listed_count, follower_count).<br><br>

**3. How can a business use your model/results?** <br>
WIth our model, companies are able to find the most interactive and popular influencers among all users in Twitter to promote their products.