# Social Media Analytics Spring 2018 - Assignment #1

## Matt Barrett, Tim Lai, Christine Mulcahy, Elena Reynolds, and Brett Scroggins

## Part II: Finding influencers from Twitter

In [1]:
import pandas as pd
import numpy as np
import networkx
import math
from copy import deepcopy
import csv
import matplotlib.pyplot as plt

In [2]:
# Load the collected tweets; each row describes one tweet.
complete_df = pd.read_csv("completeDF.csv")

# Drop the columns the analysis never reads: 'Unnamed: 0' (presumably an index
# saved by an earlier export -- TODO confirm) and the raw tweet text.
del complete_df['Unnamed: 0']
del complete_df['text']

# Order rows by the tweeting account's handle. The original integer row index
# is kept on purpose: later cells join on the 'screen_name' *column*.
# (A previous `tweets_df.reindex(tweets_df['screen_name'])` call was removed:
# its result was discarded, so it never had any effect.)
tweets_df = complete_df.sort_values('screen_name', ascending = True)

tweets_df.head()

Unnamed: 0,screen_name,follower_cnt,listed_cnt,OriginalAuthor,Type
127,02July1776,770.0,2.0,stealthygeek,RT
1762,040Brian,191.0,4.0,StefanMolyneux,RT
1486,080TFairchild,187.0,2.0,bellvisuals,RT
2512,08balet80,2807.0,11.0,08balet80,Original
4595,0Kultra,1803.0,28.0,Nonsensicole,RT


In [3]:
# Export a slimmed-down copy of the tweets (account, original author and tweet
# type only) without touching tweets_df, which later cells still need in full.
output_df = tweets_df.drop(columns=['follower_cnt', 'listed_cnt'])

# The row index is written out as the first CSV column.
output_df.to_csv('output_file.csv')

In [4]:
# Empty directed graph for the tweet network. The next cell adds one edge per
# tweet row, pointing from the tweeting account to the tweet's original author.
net_x = networkx.DiGraph()

In [5]:
# Number of times each original author was retweeted, keyed by author handle.
retweets = {}

# Read the three needed columns by *name*. The previous version indexed each
# row positionally (row[0], row[3], row[4]), which silently depends on the
# column order left over after the deletions in the loading cell and relies on
# integer-key positional lookup on a label-indexed Series (deprecated in
# recent pandas).
for screen_name, original_author, tweet_type in zip(tweets_df['screen_name'],
                                                    tweets_df['OriginalAuthor'],
                                                    tweets_df['Type']):

    # Only rows marked as retweets count towards the author's retweet total.
    if tweet_type == "RT":
        retweets[original_author] = retweets.get(original_author, 0) + 1

    # Every row adds an edge from the tweeting account to the original author.
    # NOTE(review): rows that are not retweets also get an edge; when the
    # author is the tweeting account itself this is a self-loop, which adds 2
    # to that node's degree -- confirm this is intended.
    net_x.add_edge(screen_name, original_author)

In [6]:
# One row per graph node (account handle), one column per influence measure.
# Each value below is a {handle: value} mapping, so pandas aligns them on the
# handle; handles missing from a mapping get NaN in that column.
influence_df = pd.DataFrame({
    'retweets': retweets,
    # dict(...) makes this work whether degree() returns a plain dict
    # (networkx 1.x) or a DegreeView of (node, degree) pairs (networkx 2.x+),
    # which pandas would not interpret as a handle -> degree mapping.
    'degree': dict(net_x.degree()),
    'betweenness': networkx.betweenness_centrality(net_x),
    'closeness': networkx.closeness_centrality(net_x),
})

In [7]:
# Accounts that were never retweeted have no entry in the `retweets` dict and
# therefore show up as NaN; treat them as zero retweets.
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: in-place mutation through a column selection is not
# guaranteed to reach the parent frame (it is a no-op under pandas
# Copy-on-Write).
influence_df['retweets'] = influence_df['retweets'].fillna(0)
influence_df.head()

Unnamed: 0,betweenness,closeness,degree,retweets
02July1776,0.0,0.000164,1,0.0
040Brian,0.0,0.000164,1,0.0
080TFairchild,0.0,0.000164,1,0.0
08balet80,0.0,0.0,2,0.0
0Kultra,0.0,0.000164,1,0.0


In [8]:
# Express every node's degree as a fraction of the largest degree in the graph.
scaled_degree = influence_df['degree'].div(influence_df['degree'].max())

In [9]:
# Replace the raw degree counts with the max-scaled values from the previous
# cell, so the highest-degree account ends up with degree 1.0.
influence_df['degree'] = scaled_degree
# Display the first rows to check the rescaled column.
influence_df.head()

Unnamed: 0,betweenness,closeness,degree,retweets
02July1776,0.0,0.000164,0.005525,0.0
040Brian,0.0,0.000164,0.005525,0.0
080TFairchild,0.0,0.000164,0.005525,0.0
08balet80,0.0,0.0,0.01105,0.0
0Kultra,0.0,0.000164,0.005525,0.0


In [10]:
# Attach the network measures (indexed by account handle) to the per-tweet
# rows, then key the result by the tweeting account's handle.
full_df = pd.merge(
    tweets_df,
    influence_df,
    left_on='screen_name',
    right_index=True,
).set_index('screen_name')

# An account appears once per tweet; keep only its first row so the frame has
# exactly one row per account.
is_repeat = full_df.index.duplicated(keep='first')
full_df = full_df[~is_repeat]

# The per-tweet columns are meaningless at account level, so drop them.
full_df = full_df.drop(columns=['OriginalAuthor', 'Type'])

# Number of distinct accounts that remain.
len(full_df)

6115

In [11]:
# Standardize the count-based features to z-scores: (value - mean) / std.
# The column mean and std are computed once per column and applied
# vectorially; the previous per-element lambda recomputed both statistics for
# every single row (quadratic work) and was copy-pasted for each column.
for col in ['follower_cnt', 'listed_cnt', 'retweets']:
    full_df[col] = (full_df[col] - full_df[col].mean()) / full_df[col].std()

In [12]:
# Weights used by the influence score, in the order the score cell reads them:
# w[0] retweets, w[1] listed_cnt, w[2] follower_cnt,
# w[3] the summed network measures (degree + betweenness + closeness).
w = [0.2, 0.4, 0.2, 0.2]

In [13]:
# Influence score: weighted sum of the standardized retweet, list and follower
# counts plus the combined network measures (scaled degree, betweenness and
# closeness share the single weight w[3]).
full_df['score'] = (
    w[0] * full_df['retweets']
    + w[1] * full_df['listed_cnt']
    + w[2] * full_df['follower_cnt']
    + w[3] * (full_df['degree'] + full_df['betweenness'] + full_df['closeness'])
)

In [14]:
# Preview the per-account features together with the newly added score column.
full_df.head()

Unnamed: 0_level_0,follower_cnt,listed_cnt,betweenness,closeness,degree,retweets,score
screen_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
02July1776,-0.080401,-0.140795,0.0,0.000164,0.005525,-0.13029,-0.097319
040Brian,-0.089844,-0.137005,0.0,0.000164,0.005525,-0.13029,-0.097691
080TFairchild,-0.089909,-0.140795,0.0,0.000164,0.005525,-0.13029,-0.09922
08balet80,-0.047178,-0.123739,0.0,0.0,0.01105,-0.13029,-0.082779
0Kultra,-0.063553,-0.09152,0.0,0.000164,0.005525,-0.13029,-0.074239


In [15]:
# The ten accounts with the highest influence score, best first.
full_df['score'].sort_values(ascending=False).head(10)

screen_name
MSNBC             21.980027
ACLU              15.680583
elizabethforma    15.440219
glamourmag        10.245322
NYDailyNews        9.787339
ajplus             9.646476
AC360              8.829026
tomcolicchio       8.575710
TomiLahren         7.457697
B75434425          6.691237
Name: score, dtype: float64