# SMA Assignment 1

### Group Members: Abhinav Singh, Catherine Miao, Eddie Eustachon, Elaine Wang, Thomas Bruce, Qinpei Zou

### Part I: Find predictors of influence

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_curve, roc_auc_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
import warnings
warnings.filterwarnings("ignore")

### 1. Read Data

In [3]:
# Load the training set from the current working directory. A comma is
# already pandas' default delimiter, so it does not need to be spelled out.
df = pd.read_csv("train.csv")

FileNotFoundError: [Errno 2] File b'train.csv' does not exist: b'train.csv'

In [None]:
# Show the first five rows as a quick sanity check of the load.
df.head(n=5)

### 2. Use a correlation matrix to check feature correlations

### 2.1 Subset features with individual A 

In [None]:
# Keep the columns at positions 1..11 for the correlation check
# (presumably the A_* features - confirm against the CSV header).
df2 = df.loc[:, df.columns[1:12]]

In [None]:
# Pairwise correlations between the columns selected in `df2`.
corr = df2.corr()

# Draw on an explicitly created Axes instead of relying on pyplot's implicit
# "current axes" state (seaborn is imported with the other dependencies in
# the import cell at the top of the notebook).
f, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(
    corr,
    vmin=-1, vmax=1, center=0,  # symmetric colour scale centred on zero
    cmap=sns.diverging_palette(20, 220, n=200),
    square=True,
    annot=True,
    ax=ax,
)
ax.set_title('Feature correlation matrix')
# Rotate the x tick labels so long feature names stay readable.
ax.set_xticklabels(
    ax.get_xticklabels(),
    rotation=45,
    horizontalalignment='right'
);
# Display the numeric matrix underneath the figure.
corr

### Correlation matrix indicates: 
**We should drop network_feature_1 and mentions_received, since both features are highly correlated with retweets_received: cor(network_feature_1, retweets_received) = 0.928 and cor(mentions_received, retweets_received) = 0.99.**

### 3. Feature Reduction & Transformation

In [None]:
# Build the modelling frame as a brand-new object in one chained expression.
# The previous version mutated a slice of `df` with inplace=True, which
# triggers pandas' SettingWithCopyWarning and makes the cell's result depend
# on hidden in-place state.
df4 = (
    df[df.columns[1:23]]
    # Swap exact zeros for a tiny positive value.
    .replace(0, 0.00001)
    # Drop the A/B columns found to be highly correlated with
    # retweets_received in the correlation analysis.
    .drop(['A_mentions_received', 'B_mentions_received',
           'A_network_feature_1', 'B_network_feature_1'],
          axis=1)
)

In [None]:
# Features for which an "A minus B" difference column is engineered.
# The order here fixes the order of the resulting diff_* columns.
# (The earlier hand-written version assigned diff_posts twice.)
diff_feature_names = [
    'follower_count',
    'posts',
    'following_count',
    'listed_count',
    'retweets_received',
    'mentions_sent',
    'retweets_sent',
    'network_feature_3',
    'network_feature_2',
]

# diff_<feature> = A_<feature> - B_<feature>, one new column per feature.
for feature_name in diff_feature_names:
    df4['diff_' + feature_name] = df4['A_' + feature_name] - df4['B_' + feature_name]

df4.head()

### 4. Model Predictions 

##### (1) Logistic Regression

In [None]:
# Predictors: the engineered A-minus-B difference columns, selected by name
# so the choice does not silently depend on column position.
X = df4[[col for col in df4.columns if col.startswith('diff_')]]
assert X.shape[1] > 0, "no diff_* columns found - run the feature engineering cell first"

# Target: the `Choice` column of the raw frame.
Y = df['Choice'].values

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.20, random_state=5)

# Standardise using statistics learned from the training rows only, then
# apply the same transform to the held-out rows. Previously the scaler was
# fitted on all rows (leaking test-set statistics) and its output was never
# fed to any model.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X.columns, index=X_train.index)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X.columns, index=X_test.index)

# Kept for backward compatibility: every row transformed with the
# train-fitted scaler.
X_scaled = pd.DataFrame(scaler.transform(X), columns = X.columns)

In [None]:
# Baseline classifier: logistic regression with scikit-learn's defaults.
LR = LogisticRegression()
LR.fit(X_train, y_train)
y_pred_lr = LR.predict(X_test)

# Hold-out accuracy and confusion matrix.
acc_lr = accuracy_score(y_test, y_pred_lr)
c_lr = confusion_matrix(y_test, y_pred_lr)
print('Predict accuracy of Logistic Regression is ', acc_lr)

# Confusion matrix as an annotated heatmap; the labels are applied to both
# the true-class rows and the predicted-class columns.
labels = ['A<B', 'A>B']
ax_lr = sns.heatmap(
    c_lr,
    annot=True,
    fmt='d',
    cmap="Blues",
    vmin=0.1,
    xticklabels=labels,
    yticklabels=labels,
)
ax_lr.set_title('Confusion Matrix for logistic regression')
ax_lr.set_ylabel('True Class')
ax_lr.set_xlabel('Predicted Class')
plt.show()
c_lr

##### (2) Random Forest

In [None]:
# Random forest with 200 trees. The seed is fixed so that the accuracy and
# the feature importances reported below are reproducible across runs
# (RandomForestClassifier is imported in the notebook's top import cell).
rf = RandomForestClassifier(n_estimators = 200, random_state = 5)
rf.fit(X_train, y_train)
y_pred_rf = rf.predict(X_test)

# Hold-out accuracy and confusion matrix.
acc_rf = metrics.accuracy_score(y_test, y_pred_rf)
c_rf=confusion_matrix(y_test, y_pred_rf)
print('Predict accuracy of Random Forest is ', acc_rf)

# Confusion matrix as an annotated heatmap (rows: true class, columns:
# predicted class).
labels = ['A<B', 'A>B']
sns.heatmap(c_rf,annot = True, xticklabels=labels, yticklabels=labels, fmt='d', cmap="Blues", vmin = 0.1);
plt.title('Confusion Matrix for Random Forest')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()
c_rf

### Obtain Feature Importance with Random Forest 

In [None]:
# One-column table of the forest's importance scores, indexed by feature
# name and ordered from most to least important.
feature_importances = (
    pd.Series(rf.feature_importances_, index=X_train.columns, name='importance')
    .to_frame()
    .sort_values(by='importance', ascending=False)
)
feature_importances

In [None]:
# Horizontal bar chart of the five highest-ranked features; re-sorted in
# ascending order so the largest bar is drawn at the top.
feature_importances.head(5).sort_values(by='importance').plot(kind='barh')

**From the random forest model, we got 79.64% prediction accuracy on out-of-sample data. Also, from the feature importances above, the top three most important features are:**

#### 1. diff_listed_count 
#### 2. diff_follower_count
#### 3. diff_retweets_received

##### (3) XGBoost

In [None]:
!pip install xgboost

In [None]:
# Import the estimator class directly: the previous `import xgboost as xgb`
# alias was immediately overwritten by the model instance of the same name.
# The import stays in this cell because xgboost is installed by the cell above.
from xgboost import XGBClassifier

# Gradient-boosted trees with a fixed seed and a small learning rate.
xgb = XGBClassifier(random_state=1, learning_rate=0.01)
xgb.fit(X_train, y_train)
y_pred_xgb = xgb.predict(X_test)

# Hold-out accuracy and confusion matrix.
acc_xgb = metrics.accuracy_score(y_test, y_pred_xgb)
c_xgb=confusion_matrix(y_test, y_pred_xgb)
# The message previously said "Random Forest" (copy-paste slip).
print('Predict accuracy of XGBoost is ', acc_xgb)

# Confusion matrix as an annotated heatmap (rows: true class, columns:
# predicted class).
labels = ['A<B', 'A>B']
sns.heatmap(c_xgb, annot = True, xticklabels=labels, yticklabels=labels, fmt='d', cmap="Blues", vmin = 0.1);
plt.title('Confusion Matrix for XGboost')
plt.ylabel('True Class')
plt.xlabel('Predicted Class')
plt.show()
c_xgb

#### Therefore, both the Random Forest and the XGBoost models achieve an accuracy of ~79%.

**From the random forest model, we determine that diff_listed_count, diff_follower_count, diff_retweets_received are three important predictors of influencers.**

**Diff_listed_count** is the most important variable, accounting for about 21% of the total feature importance. Listed_count measures the number of Twitter lists that include this user. A List is a curated group of Twitter accounts. The higher the listed_count, the more people follow the user with interest.

**Diff_follower_count** is the second most important variable, accounting for 16.6%. Follower_count is the number of followers this account currently has, so diff_follower_count shows how many more followers A has than B. The larger diff_follower_count is, the more likely it is that A is more influential than B.

**Diff_retweets_received** is the third most important variable, accounting for 15.1%. Retweets_received is the number of retweets this user received. The more retweets a user receives, the more people pay attention to the tweets he/she sends. So diff_retweets_received is another good predictor of the difference in influence between A and B.


### Business Implication of our model: 

If a business is running a marketing promotion and needs to allocate limited funding to pay selected media influencers, this model helps it decide who is the better influencer, i.e. which media influencer will bring higher profit for the business and is therefore the individual to pay.

In addition, it is a great model for businesses to strategize their advertising campaigns. For example, when a new product is launched, the business needs to advertise it on social media platforms. Employing this predictive model is an excellent way to target the right influencers and, in turn, make the advertising campaign more effective.

### Calculate the financial value of our model

**Without analytics:**

Profit_1 = 10 ✖ 0.01% ✖ (A_followers_count + B_followers_count) - 5 ✖ (# of A + # of B)

**With analytics:**

Profit_2 = 10 ✖ 0.015% ✖ prediction accuracy ✖  Influencers_followers_count - 10 ✖  # of Influencer

**Lift in net profit:**
Profit_2 - Profit_1

**Percentage lift in net profit:** 

(Profit_2 - Profit_1) / Profit_1 × 100%

**Without analytics, the retailer will let everyone (A + B) tweet once, so profit_1 will be as follows (under the assumption that each user appears only once in the data):**

In [None]:
# Choice == 1 marks A as the influencer of the pair, Choice == 0 marks B.
# Each total is taken with a single .loc selection and pandas' vectorised
# .sum() instead of chained indexing plus Python's element-by-element sum().
# Note: .sum() skips missing values, whereas the builtin would propagate NaN.

# Total followers of A over the rows where A is the influencer.
A_influencer_follower = df.loc[df['Choice'] == 1, 'A_follower_count'].sum()

# Total followers of B over the rows where B is the influencer.
B_influencer_follower = df.loc[df['Choice'] == 0, 'B_follower_count'].sum()


In [None]:
# Profit_1 = 10 x 0.01% x (influencer followers) - 5 x (# of A + # of B),
# where A and B each contribute one participant per row of `df`.
n_rows = len(df)
influencer_followers_total = A_influencer_follower + B_influencer_follower
profit_1 = 10 * 0.0001 * influencer_followers_total - 5 * (n_rows + n_rows)
print('Profit without analytics : ', round(profit_1,2) )

**With analytics, the retailer will target only influencers, and our model accuracy is 79.45%. Also, the retailer targets only one of A or B in each pair. Profit_2 will be:**

In [None]:
# Profit_2 = 10 x 0.015% x accuracy x (influencer followers) - 10 x (# rows).
# NOTE(review): the 0.7945 accuracy is hard-coded; presumably it was copied
# from one of the model runs above - confirm, or reference the computed value.
followers_of_influencers = A_influencer_follower + B_influencer_follower
profit_2 = 10 * 0.00015 * 0.7945 * followers_of_influencers - 10 * len(df)
print('Profit with analytics : ', round(profit_2,2) )

In [None]:
# Absolute lift: extra profit earned with analytics over the baseline.
lift_profit = profit_2 - profit_1
print('Lift in profit : ', round(lift_profit,2))

# Relative lift: the same difference as a fraction of the baseline profit.
# (The name `lift_profit` is rebound, so afterwards it holds the fraction.)
lift_profit = lift_profit / profit_1
print('%Lift in profit : ', round(100*lift_profit,1),"%")

**With a perfect analytic model, i.e. one whose prediction accuracy is 100%, the lift in net profit will be:**

In [None]:
# Same formula as profit_2 but with accuracy = 100%, so the accuracy factor
# drops out of the revenue term.
perfect_followers_total = A_influencer_follower + B_influencer_follower
profit_perfect = 10 * 0.00015 * perfect_followers_total - 10 * len(df)

# Absolute and percentage lift of the perfect model over the baseline.
lift_profit_perfect = profit_perfect - profit_1
pct_lift_profit_perfect = 100 * lift_profit_perfect / profit_1
print('Lift in net profit from the perfect analytic model : ', round(lift_profit_perfect,2))
print('%Lift in net profit from the perfect analytic model : ', round(pct_lift_profit_perfect,1),"%")

### In conclusion, using our analytic model above would increase profit for the retailer by around 20% compared to not using analytics, while using a perfect model would increase profit by around 50%. Therefore, we determine that the financial value of analytics is substantial 