In [1]:
import os
import glob
import warnings
import collections
import pandas as pd

# Suppress every Python warning for the remainder of the session.
warnings.filterwarnings(action='ignore')
# Output path prefixes. File names are appended directly to these strings,
# so the trailing slash is required.
networkWritePath = "cleaned/network_analysis/"
sentWritePath = "cleaned/sentiment_analysis/"

### Raw data exploration
Data Source:  [#metoo Digital Media Collection - Twitter Dataset (from Harvard University)](https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/2SRSKJ)
<br>
We first explore a single partition of the raw data.

In [2]:
# Read one raw partition with the python parser engine and preview its first ten rows.
raw = pd.read_csv(filepath_or_buffer='dataset/metoo_dataset_01.csv', engine='python')
raw.head(n=10)

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Mon Dec 17 16:46:43 +0000 2018,,,,0,1074707472814882816,,,,...,1419,1102,34,"Brooklyn, NY",Joanne N. Smith,JoanneNSmith,1971,,http://www.ggenyc.org,False
1,,Mon Dec 17 16:11:24 +0000 2018,KatrinaKaif,,,0,1074698585361371136,,,,...,947,4882,0,India,Kaushal Kamdar (( 7 Companies - 164 Divisions )),skg_mne,46547,,https://www.facebook.com/kaushal.kamdar.skg.mb...,False
2,,Mon Dec 17 17:47:44 +0000 2018,Jesuits priests children churchtoo MeTooMoveme...,,https://www.washingtonpost.com/religion/2018/1...,0,1074722830527295488,,,,...,3237,4115,0,"California, USA",#KindnessMatters,HillarySi2016,88772,,,False
3,,Mon Dec 17 12:59:50 +0000 2018,metoomovement,,https://pushviews.com/blog/me-too-movement-in-...,0,1074650375863586817,,,,...,274,645,1,,Jasmine Clark,jasmine_clark2,2077,,,False
4,,Mon Dec 17 14:30:15 +0000 2018,MuteRKelly SurvivingRKelly,,,0,1074673131825049600,,,,...,3914,0,18,,Kelly,D4P6mpbqR3oaHK,7737,,,False
5,,Mon Dec 17 16:07:28 +0000 2018,KatrinaKaif MeTooMovement MeToo,,https://www.spotboye.com/bollywood/Bollywood-i...,50,1074697594704683008,,,,...,573221,651,160,Mumbai,SpotboyE,Spotboye,87895,,https://www.spotboye.com/,True
6,,Mon Dec 17 15:38:16 +0000 2018,MissUniverse MissUniverse2018 Top5 MissUnivers...,,,0,1074690248439939072,,,,...,274,645,1,,Jasmine Clark,jasmine_clark2,2077,,,False
7,,Mon Dec 17 13:56:55 +0000 2018,enough nomore,,http://www.mindfultshirt.com,0,1074664742021664773,,,,...,13,200,6,,C.R.E.B.,nicochase,1019,,,False
8,,Mon Dec 17 16:16:15 +0000 2018,,,,0,1074699807585918978,,,,...,11,13,0,India,subhrat,subhrat7,2050,,,False
9,,Mon Dec 17 15:43:46 +0000 2018,MeToo MeTooIndia MeTooLiars MeToo4Publicity Fe...,,,0,1074691630798524417,,,,...,206,757,1,"Bengaluru, India",Baskaran,MyNation_b77ka,13969,,,False


### Concatenate all CSV files
Concatenate the CSV files (**partitions**) into one bulk file for analysis.

This may take up to 15 - 20 minutes.

In [3]:
extension = 'csv'
# glob returns paths in arbitrary order; sorting makes the row order of the
# combined file reproducible from run to run
all_filenames = sorted(glob.glob(f'./dataset/*.{extension}'))
# combine all files in the list; ignore_index gives every row a unique index
# instead of repeating each partition's own 0..n-1 index
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames], ignore_index=True)
# export to csv (create the output directory first so a fresh checkout can run this cell)
os.makedirs("combined", exist_ok=True)
combined_csv.to_csv("combined/metoo_combined.csv", index=True, encoding='utf-8-sig')

### Choose metoo_combined.csv as target dataset 
metoo_combined.csv is the concatenated data of 5 random partitions out of the 33 partitions

This may take up to 15 - 20 minutes.

In [4]:
# Only these four columns are used downstream; asking the parser for just them
# avoids materialising every other column of the (large) combined file.
neededColumns = ['id', 'text', 'retweet_id', 'retweet_screen_name']
df = pd.read_csv('combined/metoo_combined.csv', engine='python', usecols=neededColumns)
# create dataframe for sentiment analysis (rows with any missing value are dropped)
df = df[neededColumns].dropna()
# create dataframe for network analysis
# NOTE: this is taken from the already-filtered sentiment frame, so rows that
# lack retweet_screen_name are excluded here as well.
suspectDF = df[['id', 'text', 'retweet_id']].dropna()

dataframe preview: for sentiment analysis, we need four columns -> 
**id**, **text**, **retweet_id**, **retweet_screen_name**

In [5]:
# Preview the first ten rows of the sentiment-analysis frame.
df.head(n=10)

Unnamed: 0,id,text,retweet_id,retweet_screen_name
0,1050071391125495810,RT @circmovie: #MeToo #MenToo https://t.co/v6x...,1.050068858776838e+18,circmovie
1,1050071441348186117,RT @bhaskar_vats: #FakeCases #PindaDaanofWives...,1.0496775968734863e+18,bhaskar_vats
2,1050071303909138432,RT @koenamitra: Can someone please bring back ...,1.0497396288934298e+18,koenamitra
3,1050071289539452928,RT @ShefVaidya: This is the REAL #metoo. But t...,1.050046996168368e+18,ShefVaidya
4,1050071215883186177,RT @mvmeet: Ladies said he had golden HEART\nI...,1.050033326201983e+18,mvmeet
6,1050071336717049857,RT @DeathEndsFun: Read @ghazalawahab’s disturb...,1.0500118104750449e+18,DeathEndsFun
7,1050071318765264896,RT @JhaSanjay: I can’t imagine a more morally ...,1.0500151610362756e+18,JhaSanjay
9,1050071384666333184,RT @ShefVaidya: This is the REAL #metoo. But t...,1.050046996168368e+18,ShefVaidya
10,1050071636324573189,RT @shaktisinhgohil: #MeToo : केन्द्रीय मंत्री...,1.0497301098196337e+18,shaktisinhgohil
11,1050071342534389760,RT @tavleen_singh: It does not have my support...,1.0500620974998652e+18,tavleen_singh


### Choose our suspect target and sort the suspect name list
read suspect information in our predefined suspect info text file and store info in a list

In [6]:
suspectNameList = []
with open('metadata/suspectInfo.txt', 'r') as f:
    # each line is "<suspect name>,<follower count>,<regex pattern>"
    for line in f:
        if not line.strip():
            # skip blank lines so they do not break the three-way unpack below
            continue
        # maxsplit=2: only the first two commas separate fields, so a comma
        # inside the regex pattern stays part of the pattern
        suspectName, followers, pattern = line.split(",", 2)
        # only the newline is removed; any other whitespace in the pattern is kept as-is
        item = (suspectName, int(followers), pattern.rstrip('\n'))
        suspectNameList.append(item)

# order suspects by follower count, largest first
target = sorted(suspectNameList, key=lambda x: x[1], reverse=True)
print(target)

[('Cristiano Ronaldo', 91369000, 'Cristiano Ronald|CristianoRonald|Cr7|Cristiano Ronaldo dos Santos Aveiro '), ('Neil deGrasse Tyson', 14495000, '@neiltyson|Neil deGrasse Tyson|deGrasse|haydenplanetarium.org/tyson/'), ('Russell Simmons', 4525000, 'Russell Simmons|UncleRUSH'), ('Al Franken', 965000, 'Al Franken|alfranken'), ('Mark Halperin', 311000, 'Mark Halperin|MarkHalperin|Halperin'), ('Harvey Weinstein', 14000, 'Weinstein|HarveyWeinstein|Harvey Scissorhands|HarveyScissorhands'), ('Larry Nassar', 2844, 'Larry Nassar|Larry_Nassar|LarryNassar|Nassar '), ('Matt Lauer', 991, 'Matt Lauer|MattLauer|Lauer')]


### Create the dataframe for sentiment analysis and network analysis
for sentiment analysis data,
we need four columns, which are **id**, **text**, **retweet_id**, **retweet_screen_name** <br>
for network analysis data,
we need two columns, which are **id** (node) and **retweet_id** (node), to find the relationship (edge) between them

In [7]:
suspectMap = {}
for suspectName, followers, pattern in target:
    # generate CSV file for sentiment analysis
    # file name is the suspect name with all whitespace removed
    filename = "".join(suspectName.split())
    # pattern is used as a case-insensitive regular expression against the tweet text
    sentDF = df[df['text'].str.contains(pat=pattern, case=False)]
    sentPath = f"{sentWritePath}{filename}.csv"
    # create the output directory if needed so the write does not fail on a fresh checkout
    os.makedirs(os.path.dirname(sentPath) or ".", exist_ok=True)
    sentDF.to_csv(sentPath, index=False, encoding='utf-8-sig')
    # generate (id, retweet_id) pairs for network analysis
    tmpDF = suspectDF[suspectDF['text'].str.contains(pat=pattern, case=False)]
    tmpDF = tmpDF[["id", "retweet_id"]]
    print(f"suspect name: {suspectName}\n")
    print(f"total number of rows: {len(tmpDF)}\n")
    print(tmpDF)
    # pair the two columns row by row (replaces one .iloc scalar lookup per cell)
    networkList = list(zip(tmpDF["id"], tmpDF["retweet_id"]))
    print("=======================================\n")

    # assign suspect name as key, [(id, retweet_id)] as value
    suspectMap[suspectName] = networkList


suspect name: Cristiano Ronaldo

total number of rows: 394

                          id              retweet_id
16315    1049966662261325824   1.049611258616574e+18
72915    1049820106501558273  1.0497779191978844e+18
80219    1050043761877811200  1.0500427970998067e+18
97217    1049896974491758596  1.0497358776001823e+18
137556   1084174751998844929   1.083779957908693e+18
...                      ...                     ...
2127616  1154251744072720385  1.1542467111218708e+18
2127710  1154248866889228288  1.1542467111218708e+18
2131737  1154107260936826880   1.153402787386069e+18
2274217  1176506112033599488   1.176121742257021e+18
2339397  1192471026648506369   1.192470902899794e+18

[394 rows x 2 columns]

suspect name: Neil deGrasse Tyson

total number of rows: 19

                          id              retweet_id
442514   1052032346621390848  1.0520323294121738e+18
447057   1052144101766909952   1.052137451312087e+18
763082   1084480376381341696  1.0844590189099008e+18
102740

## Build subgraph for each suspect list data

We use two functions to build the graph-structured data: <br>
**buildSubGraph** builds an adjacency map from the network list of each suspect<br>
**mapToList** creates a new adjacency list from the adjacency map

In [8]:
def buildSubGraph(networkList):
    """Build an adjacency map from a list of (tweeterId, retweetId) pairs.

    Pairs are grouped by ``retweetId``. Within a group, each newly seen
    ``tweeterId`` is linked to every member already in that group, unless that
    undirected edge was already recorded (in this or any earlier group).

    Args:
        networkList: iterable of ``(tweeterId, retweetId)`` pairs.

    Returns:
        ``defaultdict`` mapping ``retweetId -> {member: [linked tweeterIds]}``.
        Every member of a group has an entry, possibly with an empty list.
    """
    adjMap = collections.defaultdict(dict)
    seenEdges = set()

    for tweeterId, retweetId in networkList:
        members = adjMap[retweetId]
        # register the tweeter in its group without touching an existing entry
        members.setdefault(tweeterId, [])
        tweeterLabel = str(tweeterId)

        for member, neighbours in members.items():
            if member == tweeterId:
                continue
            # order-independent key so (a, b) and (b, a) count as the same edge
            edge = tuple(sorted((str(member), tweeterLabel)))
            if edge in seenEdges:
                continue
            seenEdges.add(edge)
            neighbours.append(tweeterId)

    return adjMap

In [9]:
def mapToList(adjMap):
    """Flatten an adjacency map into a list of ``"<node> <neighbour>"`` strings.

    Args:
        adjMap: mapping of ``group -> {node: [neighbours]}``; the group keys
            themselves are not used.

    Returns:
        One string per (node, neighbour) pair, in iteration order of the map.
        Nodes with an empty neighbour list contribute nothing.
    """
    return [
        f"{source} {neighbour}"
        for members in adjMap.values()
        for source, neighbours in members.items()
        for neighbour in neighbours
    ]

In [10]:
def writeEdgeList(adjList, suspectName, fileType='txt'):
    """Write an edge list to ``<networkWritePath><SuspectName>.<fileType>``.

    Args:
        adjList: iterable of edge strings; each is written on its own line.
        suspectName: suspect name; all whitespace is removed to form the file name.
        fileType: file extension without the dot (default ``'txt'``).
    """
    filename = "".join(suspectName.split())
    path = f'{networkWritePath}{filename}.{fileType}'
    # create the target directory if it is missing so the write does not fail
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)
    with open(path, 'w') as f:
        f.writelines(f"{txt}\n" for txt in adjList)

### Create subgraph list data as text file
Build the subgraph for each suspect, transform the adjacency map into an adjacency list, and write out the edge list.

In [11]:
# For every suspect: build the adjacency map, flatten it, and write the edge list file.
for suspectName in suspectMap:
    networkList = suspectMap[suspectName]
    adjMap = buildSubGraph(networkList)
    adjList = mapToList(adjMap)
    writeEdgeList(adjList, suspectName)

#### Setup helper union find class (Probably won't be needed)

In [12]:
class UnionFind:
    """Disjoint-set (union-find) structure over hashable elements.

    Elements must be registered with ``add`` before they are passed to
    ``find`` or ``union``; unknown elements raise ``KeyError``.
    """

    def __init__(self):
        # element -> parent element; a root is its own parent
        self.parent = {}
        # number of disjoint sets among the added elements
        self.count = 0

    def find(self, x):
        """Return the root of the set containing ``x``, compressing the path.

        Implemented iteratively so that a long parent chain cannot exceed
        Python's recursion limit.

        Raises:
            KeyError: if ``x`` was never added.
        """
        # first pass: walk up to the root
        root = x
        while self.parent[root] != root:
            root = self.parent[root]

        # second pass: point every node on the path directly at the root
        while x != root:
            nextNode = self.parent[x]
            self.parent[x] = root
            x = nextNode

        return root

    def union(self, x, y):
        """Merge the sets containing ``x`` and ``y`` (no-op if already merged)."""
        rootX = self.find(x)
        rootY = self.find(y)

        if rootX != rootY:
            self.parent[rootX] = rootY
            self.count -= 1

    def add(self, x):
        """Register ``x`` as a new singleton set; does nothing if already present."""
        if x not in self.parent:
            self.parent[x] = x
            self.count += 1