In [1]:
import pandas as pd
import re

In [2]:
data=pd.read_csv('../week3/Political-media-DFE.csv', encoding='latin1')

In [3]:
data.columns

Index(['_unit_id', '_golden', '_unit_state', '_trusted_judgments',
       '_last_judgment_at', 'audience', 'audience:confidence', 'bias',
       'bias:confidence', 'message', 'message:confidence', 'orig__golden',
       'audience_gold', 'bias_gold', 'bioid', 'embed', 'id', 'label',
       'message_gold', 'source', 'text'],
      dtype='object')

> * Subset the DataFrame to have `label`, `source`, and `text` columns

In [4]:
df=data[['label', 'source', 'text']]

In [5]:
df.shape

(5000, 3)

> * Q. How many unique users are there in the dataset? (check the `label` column)

In [6]:
df['label'].nunique()

505

> * Let's print the first 5 rows of the DataFrame

In [7]:
df.head(5)

Unnamed: 0,label,source,text
0,From: Trey Radel (Representative from Florida),twitter,RT @nowthisnews: Rep. Trey Radel (R- #FL) slam...
1,From: Mitch McConnell (Senator from Kentucky),twitter,VIDEO - #Obamacare: Full of Higher Costs and ...
2,From: Kurt Schrader (Representative from Oregon),twitter,Please join me today in remembering our fallen...
3,From: Michael Crapo (Senator from Idaho),twitter,RT @SenatorLeahy: 1st step toward Senate debat...
4,From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...


> * Let's extract the name of the user from the `label` column and create a new column called `username` in the DataFrame

In [8]:
pattern=re.compile(r'(.+?)\s*\(')
df['username']=df['label'].str.replace('From: ', '').str.findall(pattern).str[0]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['username']=df['label'].str.replace('From: ', '').str.findall(pattern).str[0]


>* Let's convert the usernames to lowercase

In [9]:
df['username']=df['username'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['username']=df['username'].str.lower()


In [10]:
df['username']

0            trey radel
1       mitch mcconnell
2         kurt schrader
3         michael crapo
4            mark udall
             ...       
4995           ted yoho
4996           ted yoho
4997           ted yoho
4998           ted yoho
4999           ted yoho
Name: username, Length: 5000, dtype: object

In [11]:
df.reset_index(drop=True, inplace=True)

> * Let's extract the mentions from the `text` column and create a new column `mention` with the mentions.

In [12]:
pattern = re.compile(r'@[a-zA-Z0-9]+')
df['mention']=df['text'].apply(lambda x: pattern.findall(x))
df['mention']=df['mention'].apply(lambda x: [y.lower() for y in x])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mention']=df['text'].apply(lambda x: pattern.findall(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['mention']=df['mention'].apply(lambda x: [y.lower() for y in x])


>* We also don't want to forget to convert the text into lowercase.

In [13]:
df['text']=df['text'].str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['text']=df['text'].str.lower()


In [14]:
df.head(5)

Unnamed: 0,label,source,text,username,mention
0,From: Trey Radel (Representative from Florida),twitter,rt @nowthisnews: rep. trey radel (r- #fl) slam...,trey radel,[@nowthisnews]
1,From: Mitch McConnell (Senator from Kentucky),twitter,video - #obamacare: full of higher costs and ...,mitch mcconnell,[]
2,From: Kurt Schrader (Representative from Oregon),twitter,please join me today in remembering our fallen...,kurt schrader,[]
3,From: Michael Crapo (Senator from Idaho),twitter,rt @senatorleahy: 1st step toward senate debat...,michael crapo,[@senatorleahy]
4,From: Mark Udall (Senator from Colorado),twitter,.@amazon delivery #drones show need to update ...,mark udall,[@amazon]


> * Let's build edges between the users who have mentioned.
> * To do so, we will use the `mentions` column and iterate over the rows to create edges between the users in the `mention` column.

In [15]:
#first we need to iterate through the mention column and then we need to iterate through usernames column
#and we need to create a tuple of the username and the mention

edges=[]
for idx, val in df.iterrows():
    if len(val['mention']) == 0: #when there are no mentions, we skip the iteration
        continue
    elif len(val['mention']) > 0: #when there are mentions, we iterate through the mentions
        for mention in val['mention']: 
            edges.append((val['username'].lower(), mention.strip('@').lower())) 
            #we append the tuple of the username and the mention to the edges list
            #lower() is used to convert the username to lowercase
            #strip() is used to remove the @ from the mention

In [16]:
edges[:10]

[('trey radel', 'nowthisnews'),
 ('michael crapo', 'senatorleahy'),
 ('mark udall', 'amazon'),
 ('heidi heitkamp', 'usdotfra'),
 ('frederica wilson', 'bbcworld'),
 ('gregg harper', 'mha'),
 ('gregg harper', 'genevrapittman'),
 ('gregg harper', 'medcitynews'),
 ('john dingell', 'skitchp'),
 ('kirsten gillibrand', 'corybooker')]

In [17]:
len(edges)

1843

In [18]:
edges[:10]

[('trey radel', 'nowthisnews'),
 ('michael crapo', 'senatorleahy'),
 ('mark udall', 'amazon'),
 ('heidi heitkamp', 'usdotfra'),
 ('frederica wilson', 'bbcworld'),
 ('gregg harper', 'mha'),
 ('gregg harper', 'genevrapittman'),
 ('gregg harper', 'medcitynews'),
 ('john dingell', 'skitchp'),
 ('kirsten gillibrand', 'corybooker')]

> * Let's count the degree centrality of each user and create a dictionary called `degree` for degree centrality.

>* First, you iterate through the unique users and count the number of edges that are connected to the user.

In [19]:
degree={}
for element in df['username'].unique():
    count = 0
    for edge in edges:
        if element in edge:
            count += 1
    degree[element]=count


>* Let's check whether it captured the degree by looking at 'trey radel' instance.

In [20]:
degree['trey radel']

5

In [21]:
for edge in edges:
    if 'trey radel' in edge:
        print(edge)

('trey radel', 'nowthisnews')
('trey radel', 'markdotdo')
('trey radel', 'treyradel')
('trey radel', 'morrow')
('trey radel', 'redmillennial')


> * We have five degree for 'trey radel'. Let's divide the degree by the number of unique users -1 to calculate degree centrality.

<img src="../week5/degree-centrality.png" width=1000px height=200px />

> * How do we want to find the number of nodes in the graph?

In [22]:
#first, we try the number of unique usernames in df.T
degree['trey radel']/(len(df['username'].unique())-1)

0.009940357852882704

> * But this is incorrect answer. Why?

In [23]:
import networkx as nx
G=nx.Graph()
G.add_edges_from(edges)


In [24]:
nx.degree_centrality(G)['trey radel']

0.0027731558513588465

> * As we talked before, the number of nodes in the graph will be larger than the number of unique users. Because we have to consider the users who have mentioned in the dataset but not in the `label` column.

>* So, we have to find the number of unique users in the edge list. 

In [25]:
degree['trey radel']/(len(set([element for tuple in edges for element in tuple]))-1)

0.0027731558513588465

> * Now we know how to calculate the degree centrality. Let's iterate through the `degree` dictionary and divide the degree by the number of unique users in the edge list.

In [26]:
for key, val in degree.items():
    degree[key]=val/(len(set([element for tuple in edges for element in tuple]))-1)

In [27]:
degree['trey radel']

0.0027731558513588465

>* Q. What can be other attributes for the nodes?

>* There is an information about whether the political leader is a Representative or a Senator.
>* Let's use RegEx to extract that and create a dictionary called `title`.

In [28]:
df['label']

0         From: Trey Radel (Representative from Florida)
1          From: Mitch McConnell (Senator from Kentucky)
2       From: Kurt Schrader (Representative from Oregon)
3               From: Michael Crapo (Senator from Idaho)
4               From: Mark Udall (Senator from Colorado)
                              ...                       
4995        From: Ted Yoho (Representative from Florida)
4996        From: Ted Yoho (Representative from Florida)
4997        From: Ted Yoho (Representative from Florida)
4998        From: Ted Yoho (Representative from Florida)
4999        From: Ted Yoho (Representative from Florida)
Name: label, Length: 5000, dtype: object

In [29]:
pattern=re.compile(r'\([A-Za-z]*')
df['title']=df['label'].str.findall(pattern).str[0].str.strip('(')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['title']=df['label'].str.findall(pattern).str[0].str.strip('(')


In [30]:
df['title'].value_counts()

title
Representative    4025
Senator            975
Name: count, dtype: int64

In [31]:
title={}
for idx, row in df.iterrows():
    title[row['username']]=row['title']

>* There is another information about which State the political leader is from.
>* Let's use RegEx to extract that and create a dictionary called `state`.

>* We are changing the value of Representative and Senator to 'o' and 'x' to use it as a node shape.

In [32]:
title_shape = {key: 'o' if value == 'Representative' else 'x' for key, value in title.items()}

In [33]:
pattern=re.compile(r'from\s.*')
df['state']=df['label'].str.findall(pattern).str[0].str.replace('from','').str.replace(')', '').str.strip()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['state']=df['label'].str.findall(pattern).str[0].str.replace('from','').str.replace(')', '').str.strip()


In [34]:
state={}
for idx, row in df.iterrows():
    state[row['username']]=row['state']


In [35]:
df['state'].value_counts()[:10]

state
Texas             494
California        486
Florida           298
New York          224
Ohio              183
Illinois          183
Pennsylvania      177
Arizona           144
New Jersey        137
North Carolina    134
Name: count, dtype: int64

In [44]:
import random
import seaborn as sns
# Get the unique values from the state dictionary
unique_states = list(set(state.values()))

# Generate a color palette using seaborn
color_palette = sns.color_palette("hls", len(unique_states))

# Create a dictionary to map each unique state to a color
state_colors_dict = {key: color_palette[i] for i, key in enumerate(unique_states)}


In [53]:
state_colors_dict['Florida']

(0.86, 0.7994352941176471, 0.33999999999999997)

In [54]:
df[df['username']=='trey radel']

Unnamed: 0,label,source,text,username,mention,title,state
0,From: Trey Radel (Representative from Florida),twitter,rt @nowthisnews: rep. trey radel (r- #fl) slam...,trey radel,[@nowthisnews],Representative,Florida
625,From: Trey Radel (Representative from Florida),twitter,ûï@markdotdo: @treyradel dig the new tumblr t...,trey radel,"[@markdotdo, @treyradel]",Representative,Florida
707,From: Trey Radel (Representative from Florida),twitter,@morrow_brett ha! ur the man! u text one night...,trey radel,[@morrow],Representative,Florida
959,From: Trey Radel (Representative from Florida),twitter,why does keeping troops out of #syria matter? ...,trey radel,[],Representative,Florida
1768,From: Trey Radel (Representative from Florida),twitter,#obamacareinthreewords - protected privacy? lol,trey radel,[],Representative,Florida
1885,From: Trey Radel (Representative from Florida),twitter,now following @redmillennial hoping to involve...,trey radel,[@redmillennial],Representative,Florida
2422,From: Trey Radel (Representative from Florida),twitter,transportation sec. announced faa can transfer...,trey radel,[],Representative,Florida


In [48]:
from collections import defaultdict
default_dict=defaultdict(lambda: None, state)
for key, value in state.items():
    if value in state_colors_dict.keys():
        default_dict[key]=state_colors_dict.get(value)
color_mapped=dict(default_dict)

In [49]:
color_mapped[list(color_mapped.keys())[0]]

(0.86, 0.7994352941176471, 0.33999999999999997)

>* Practice