In [9]:
import pandas as pd
import re
pd.options.mode.chained_assignment = None  # default='warn'


In [2]:
data=pd.read_csv('../week3/Political-media-DFE.csv', encoding='latin1')

In [3]:
data.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'audience', 'audience:confidence', 'bias',
       'bias:confidence', 'message', 'message:confidence', 'orig__golden',
       'audience_gold', 'bias_gold', 'bioid', 'embed', 'id', 'label',
       'message_gold', 'source', 'text'],
      dtype='object')

> * Subset the DataFrame to have `label`, `source`, and `text` columns

In [4]:
df=data[['label', 'source', 'text']]

In [5]:
df.shape

(5000, 3)

> * Q. How many unique users are there in the dataset? (check the `label` column)

In [6]:
df['label'].nunique()

505

> * Let's print the first 5 rows of the DataFrame

In [7]:
df.head(5)

Unnamed: 0,label,source,text
0,From: Trey Radel (Representative from Florida),twitter,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,From: Mitch McConnell (Senator from Kentucky),twitter,VIDEO - #Obamacare: Full of Higher Costs and ...
2,From: Kurt Schrader (Representative from Oregon),twitter,Please join me today in remembering our fallen...
3,From: Michael Crapo (Senator from Idaho),twitter,RT @SenatorLeahy: 1st step toward Senate debat...
4,From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...


> * Let's extract the name of the user from the `label` column and create a new column called `username` in the DataFrame

In [10]:
pattern=re.compile(r'(.+?)\s*\(')
df['username']=df['label'].str.replace('From: ', '').str.findall(pattern).str[0]

>* Let's convert the usernames to lowercase

In [11]:
df['username']=df['username'].str.lower()

In [12]:
df['username']

0            trey radel
1       mitch mcconnell
2         kurt schrader
3         michael crapo
4            mark udall
             ...       
4995           ted yoho
4996           ted yoho
4997           ted yoho
4998           ted yoho
4999           ted yoho
Name: username, Length: 5000, dtype: object

In [13]:
df.reset_index(drop=True, inplace=True)

> * Let's extract the mentions from the `text` column and create a new column `mention` with the mentions.

In [61]:
pattern = re.compile(r"@[a-zA-Z0-9]+")
df['mention']=df['text'].apply(lambda x: pattern.findall(x))
df['mention']=df['mention'].apply(lambda x: [y.lower() for y in x])

>* We also don't want to forget to convert the text into lowercase.

In [62]:
df['text']=df['text'].str.lower()

In [63]:
df.head(5)

Unnamed: 0,label,source,text,username,mention,title,state
0,From: Trey Radel (Representative from Florida),twitter,rt @nowthisnews: rep. trey radel (r- #fl) slam...,trey radel,[@nowthisnews],Representative,Florida
1,From: Mitch McConnell (Senator from Kentucky),twitter,video - #obamacare: full of higher costs and ...,mitch mcconnell,[],Senator,Kentucky
2,From: Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,kurt schrader,[],Representative,Oregon
3,From: Michael Crapo (Senator from Idaho),twitter,rt @senatorleahy: 1st step toward senate debat...,michael crapo,[@senatorleahy],Senator,Idaho
4,From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...,mark udall,[@amazon],Senator,Colorado


> * Let's build edges between the users who have mentioned.
> * To do so, we will use the `mentions` column and iterate over the rows to create edges between the users in the `mention` column.

In [64]:
#first we need to iterate through the mention column and then we need to iterate through usernames column
#and we need to create a tuple of the username and the mention

edges=[]
for idx, val in df.iterrows():
    if len(val['mention']) == 0: #when there are no mentions, we skip the iteration
        continue
    elif len(val['mention']) > 0: #when there are mentions, we iterate through the mentions
        for mention in val['mention']: 
            edges.append((val['username'].lower(), mention.strip('@').lower())) 
            #we append the tuple of the username and the mention to the edges list
            #lower() is used to convert the username to lowercase
            #strip() is used to remove the @ from the mention

In [65]:
edges[:10]

[('trey radel', 'nowthisnews'),
 ('michael crapo', 'senatorleahy'),
 ('mark udall', 'amazon'),
 ('heidi heitkamp', 'usdotfra'),
 ('frederica wilson', 'bbcworld'),
 ('gregg harper', 'mha'),
 ('gregg harper', 'genevrapittman'),
 ('gregg harper', 'medcitynews'),
 ('john dingell', 'skitchp'),
 ('kirsten gillibrand', 'corybooker')]

In [66]:
len(edges)

1843

In [67]:
edges[:10]

[('trey radel', 'nowthisnews'),
 ('michael crapo', 'senatorleahy'),
 ('mark udall', 'amazon'),
 ('heidi heitkamp', 'usdotfra'),
 ('frederica wilson', 'bbcworld'),
 ('gregg harper', 'mha'),
 ('gregg harper', 'genevrapittman'),
 ('gregg harper', 'medcitynews'),
 ('john dingell', 'skitchp'),
 ('kirsten gillibrand', 'corybooker')]

> * Let's count the degree centrality of each user and create a dictionary called `degree` for degree centrality.

>* First, you iterate through the unique users and count the number of edges that are connected to the user.

In [68]:
degree={}
for element in df['username'].unique():
    count = 0
    for edge in edges:
        if element in edge:
            count += 1
    degree[element]=count


>* Let's check whether it captured the degree by looking at 'trey radel' instance.

In [69]:
degree['trey radel']

5

In [70]:
for edge in edges:
    if 'trey radel' in edge:
        print(edge)

('trey radel', 'nowthisnews')
('trey radel', 'markdotdo')
('trey radel', 'treyradel')
('trey radel', 'morrow')
('trey radel', 'redmillennial')


> * We have five degree for 'trey radel'. Let's divide the degree by the number of unique users -1 to calculate degree centrality.

<img src="../week5/degree-centrality.png" width=1000px height=200px />

> * How do we want to find the number of nodes in the graph?

In [71]:
#first, we try the number of unique usernames in df.T
degree['trey radel']/(len(df['username'].unique())-1)

0.009940357852882704

> * But this is incorrect answer. Why?

In [72]:
import networkx as nx
G=nx.Graph()
G.add_edges_from(edges)


In [73]:
nx.degree_centrality(G)['trey radel']

0.0027731558513588465

> * As we talked before, the number of nodes in the graph will be larger than the number of unique users. Because we have to consider the users who have mentioned in the dataset but not in the `label` column.

>* So, we have to find the number of unique users in the edge list. 

In [74]:
degree['trey radel']/(len(set([element for tuple in edges for element in tuple]))-1)

0.0027731558513588465

> * Now we know how to calculate the degree centrality. Let's iterate through the `degree` dictionary and divide the degree by the number of unique users in the edge list.

In [75]:
for key, val in degree.items():
    degree[key]=val/(len(set([element for tuple in edges for element in tuple]))-1)

In [76]:
degree['trey radel']

0.0027731558513588465

>* Q. What can be other attributes for the nodes?

>* There is an information about whether the political leader is a Representative or a Senator.
>* Let's use RegEx to extract that and create a dictionary called `title`.

In [77]:
df['label']

0         From: Trey Radel (Representative from Florida)
1          From: Mitch McConnell (Senator from Kentucky)
2       From: Kurt Schrader (Representative from Oregon)
3               From: Michael Crapo (Senator from Idaho)
4               From: Mark Udall (Senator from Colorado)
                              ...                       
4995        From: Ted Yoho (Representative from Florida)
4996        From: Ted Yoho (Representative from Florida)
4997        From: Ted Yoho (Representative from Florida)
4998        From: Ted Yoho (Representative from Florida)
4999        From: Ted Yoho (Representative from Florida)
Name: label, Length: 5000, dtype: object

In [78]:
pattern=re.compile("Representative|Senator")
df['title']=df['label'].str.findall(pattern).str[0].str.strip('(')

In [79]:
df['title'].value_counts()

title
Representative    4025
Senator            975
Name: count, dtype: int64

In [80]:
title={}
for idx, row in df.iterrows():
    title[row['username']]=row['title']

>* There is another information about which State the political leader is from.
>* Let's use RegEx to extract that and create a dictionary called `state`.

>* We are changing the value of Representative and Senator to 'o' and 'x' to use it as a node shape.

In [81]:
title_shape = {key: 'o' if value == 'Representative' else 'x' for key, value in title.items()}

In [82]:
pattern=re.compile(r'from\s.*')
df['state']=df['label'].str.findall(pattern).str[0].str.replace('from','').str.replace(')', '').str.strip()

In [83]:
state={}
for idx, row in df.iterrows():
    state[row['username']]=row['state']


In [84]:
df['state'].value_counts()[:10]

state
Texas             494
California        486
Florida           298
New York          224
Ohio              183
Illinois          183
Pennsylvania      177
Arizona           144
New Jersey        137
North Carolina    134
Name: count, dtype: int64

In [85]:
import random
import seaborn as sns
# Get the unique values from the state dictionary
unique_states = list(set(state.values()))

# Generate a color palette using seaborn
color_palette = sns.color_palette("hls", len(unique_states))

# Create a dictionary to map each unique state to a color
state_colors_dict = {key: color_palette[i] for i, key in enumerate(unique_states)}


In [86]:
color_palette

In [87]:
state_colors_dict['Florida']

(0.86, 0.33999999999999997, 0.7982117647058823)

In [88]:
df[df['username']=='trey radel']

Unnamed: 0,label,source,text,username,mention,title,state
0,From: Trey Radel (Representative from Florida),twitter,rt @nowthisnews: rep. trey radel (r- #fl) slam...,trey radel,[@nowthisnews],Representative,Florida
625,From: Trey Radel (Representative from Florida),twitter,ûï@markdotdo: @treyradel dig the new tumblr t...,trey radel,"[@markdotdo, @treyradel]",Representative,Florida
707,From: Trey Radel (Representative from Florida),twitter,@morrow_brett ha! ur the man! u text one night...,trey radel,[@morrow],Representative,Florida
959,From: Trey Radel (Representative from Florida),twitter,why does keeping troops out of #syria matter? ...,trey radel,[],Representative,Florida
1768,From: Trey Radel (Representative from Florida),twitter,#obamacareinthreewords - protected privacy? lol,trey radel,[],Representative,Florida
1885,From: Trey Radel (Representative from Florida),twitter,now following @redmillennial hoping to involve...,trey radel,[@redmillennial],Representative,Florida
2422,From: Trey Radel (Representative from Florida),twitter,transportation sec. announced faa can transfer...,trey radel,[],Representative,Florida


In [89]:
from collections import defaultdict
default_dict=defaultdict(lambda: None, state)
for key, value in state.items():
    if value in state_colors_dict.keys():
        default_dict[key]=state_colors_dict.get(value)
color_mapped=dict(default_dict)

In [90]:
color_mapped['trey radel']

(0.86, 0.33999999999999997, 0.7982117647058823)

>* Practice

In [91]:
data=pd.read_csv('subset-2021-01-11-voter_fraud.csv')

>* Let's see what it has for column names.

In [50]:
data.columns

Index(['article', 'body', 'bodywithurls', 'comments', 'createdAt',
       'createdAtformatted', 'creator', 'datatype', 'depth', 'depthRaw',
       ...
       'urls.15.createdAt', 'urls.15.domain', 'urls.15.id', 'urls.15.long',
       'urls.15.metadata.length', 'urls.15.metadata.mimeType',
       'urls.15.metadata.site', 'urls.15.modified', 'urls.15.short',
       'urls.15.state'],
      dtype='object', length=420)

>* Let's subset the data to have only `bodywithurls`, `username`, `followers`, and `following` columns.

In [51]:
df_fraud = data[["bodywithurls", "username", "followers", "following"]]

>* Q. How many unique users are there in the dataset?

In [52]:
df_fraud.nunique()

bodywithurls    65
username        58
followers       49
following       52
dtype: int64

>* Q. Print the first 5 rows of the DataFrame.

In [53]:
df_fraud.head(5)

Unnamed: 0,bodywithurls,username,followers,following
0,Folks may be our last posts. Pogilosi just ask...,Terryb158,3500,5500
1,Oh notre they want to investigate something. L...,AlanBond7,100,243
2,White trash pieces of shit who are going to be...,Tifdog11,37,42
3,He has never lived in a totalitarian country; ...,AlexaImmigrant,0,9
4,White trash pieces of shit who are going to be...,Tifdog11,37,42


>* Q. In the result of the first 5 rows of the DataFrame, do you see any duplicates?

Yes there are duplicates

> * Looks like there are duplicates in the dataset. Let's remove the duplicates.

In [55]:
df_fraud_new = df_fraud.drop_duplicates()

> * Q. How many rows and columns are there in the dataset after removing the duplicates?

In [59]:
df_fraud_new.columns

df_fraud_new.nunique()

bodywithurls    65
username        58
followers       49
following       52
dtype: int64

>* Once we remove the duplicates, let's reset the index of the DataFrame.

In [130]:
df_fraud_new.reset_index(drop=True)

Unnamed: 0,bodywithurls,username,followers,following,mentions
0,Folks may be our last posts. Pogilosi just ask...,Terryb158,3500,5500,[]
1,Oh notre they want to investigate something. L...,AlanBond7,100,243,[]
2,White trash pieces of shit who are going to be...,Tifdog11,37,42,[]
3,He has never lived in a totalitarian country; ...,AlexaImmigrant,0,9,[]
4,@sidneypowell @erictrump\r\n@laraleatrump @lin...,Lovewell100,48,148,"[@sidneypowell, @erictrump, @laraleatrump, @li..."
...,...,...,...,...,...
60,Oh now they want to investigate something. Let...,AlanBond7,100,243,[]
61,GRAHAM\r\nHe begged us for money on Hannity an...,Millsfarms,3,23,[]
62,Neither is election fraud.\r\n,xfitnesscoach,7,44,[]
63,"Investigation of fraudulent voting practices, ...",Bobbfishen,147,100,[]


> * Let's extract the mentions from the `bodywithurls` column and create a new column `mentions` with the mentions.

In [131]:
#YOUR CODE HERE
pattern = re.compile(r"@[a-zA-Z0-9]+")
df_fraud_new["mentions"] = df_fraud_new["bodywithurls"].apply(lambda x: pattern.findall(x))
df_fraud_new["mentions"] = df_fraud_new["mentions"].apply(lambda x: [y.lower() for y in x])
df_fraud_new["mentions"]


0                                                    []
1                                                    []
2                                                    []
3                                                    []
5     [@sidneypowell, @erictrump, @laraleatrump, @li...
                            ...                        
85                                                   []
86                                                   []
87                                                   []
88                                                   []
89                                                   []
Name: mentions, Length: 65, dtype: object

>* Q. Which account has mentioned the most number of users?

In [140]:
var = df_fraud_new["mentions"].apply(lambda x: len(x)).sort_values(ascending=False).index[0]
var
df_fraud_new.iloc[50]
df_fraud_new.iloc[51]
df_fraud_new.iloc[1]

bodywithurls    Oh notre they want to investigate something. L...
username                                                AlanBond7
followers                                                     100
following                                                     243
mentions                                                       []
Name: 1, dtype: object

>* Q. Who are mentioned by that account?

In [133]:
#df_fraud_new["bodywithurls"].iloc[df_fraud_new["mentions"].apply(lambda x: len(x)).sort_values(ascending=False)].index[0]
#df_fraud_new[df_fraud_new["username"] == name]
df_fraud_new.iloc[var]

bodywithurls    Also Dan i am sure you are not aware of this.....
username                                         ihavehadenough43
followers                                                     293
following                                                     551
mentions                                                       []
Name: 73, dtype: object

> * Let's build edges between the users who have mentioned.
> * To do so, we will use the `mentions` column and iterate over the rows to create edges between the users in the `mention` column.

In [None]:
#YOUR CODE HERE

>* How many edges are there in the data?

In [None]:
#YOUR CODE HERE