In [1]:
# Standard library
import itertools
import re
from collections import Counter

# Third-party
import nltk
import pandas as pd
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Single shared lemmatizer instance, created once after all imports.
lemmatizer = WordNetLemmatizer()

>* We are going to use email data from the program-l mailing list to build the nodes and edges of a graph.
>* https://www.freelists.org/archive/program-l
> * Let's read the file using `pd.read_csv`.

In [2]:
# Load the sample of mailing-list threads into a DataFrame (path is relative to the notebook).
data=pd.read_csv('thread_sample.csv')

>* The `data` DataFrame has six columns, described below:
    
    * `thread_id` : unique id for each thread
    * `thread_name` : the first subject of the email
    * `body` : the content of the email 
    * `account` : the email account of the sender 
    * `url` : the url of the email
    * `date` : the date of the email 

### 1. Regular Expression (RegEx) (2pt)

#### 1-1. RegEx to extract usernames (1pt)

In [3]:
# Peek at the raw `account` string of the first thread to see its format.
data['account'].iat[0]

"['chojiro1990' 'soronel.haetir' 'chojiro1990' 'soronel.haetir'\r\n 'chojiro1990' 'soronel.haetir' 'chojiro1990' 'soronel.haetir'\r\n 'chojiro1990']"

In [4]:
# Each `account` value is a single string of single-quoted usernames;
# capture whatever sits between each pair of quotes.
pattern = re.compile(r"'([^']+)'")
data['account_list'] = data['account'].apply(pattern.findall)
data['account_list']

0     [chojiro1990, soronel.haetir, chojiro1990, sor...
1      [hedvig.jung, dzhovani.chemishanov, hedvig.jung]
2     [justind, david.lant, justind, travis, justind...
3     [birkir.gunnarsson, lras, birkir.gunnarsson, j...
4     [david.lant, markalong64, jamal.mazrui, david....
                            ...                        
95                             [cmusic789, james.homme]
96    [m10fayed, abletec, ntsiklauri2, spg1111, m10f...
97                     [jhomme, jhomme, isidor.nikolic]
98    [programmer651, james.corbett, joseph.lee22590...
99    [pmorales, rbreiten, jude.dashiell, jude.dashi...
Name: account_list, Length: 100, dtype: object

#### 1-2. RegEx to extract date (1pt)

In [5]:
# Peek at the raw `date` string of the first thread to see its format.
data['date'].iat[0]

"['2013-06-05T09:50:54.000000000' '2013-06-05T15:42:35.000000000'\r\n '2013-06-05T16:28:36.000000000' '2013-06-05T19:52:01.000000000'\r\n '2013-06-06T09:33:53.000000000' '2013-06-06T15:31:31.000000000'\r\n '2013-06-06T19:15:11.000000000' '2013-06-06T20:04:44.000000000'\r\n '2013-06-07T05:05:21.000000000']"

In [24]:
# Match timestamps of the form YYYY-MM-DDTHH:MM:SS.fffffffff (nine fractional digits)
# and collect every occurrence per thread.
pattern = re.compile(r"\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}\.\d{9}")
data['date_list'] = data['date'].apply(pattern.findall)
data['date_list'].iat[0]

['2013-06-05T09:50:54.000000000',
 '2013-06-05T15:42:35.000000000',
 '2013-06-05T16:28:36.000000000',
 '2013-06-05T19:52:01.000000000',
 '2013-06-06T09:33:53.000000000',
 '2013-06-06T15:31:31.000000000',
 '2013-06-06T19:15:11.000000000',
 '2013-06-06T20:04:44.000000000',
 '2013-06-07T05:05:21.000000000']

### 2. Pandas and Text mining (6pt)

#### 2-1. Count the number of users involved in the email conversation (1pt)

In [7]:
# Length of each thread's sender list.
# NOTE(review): this counts every sender entry, repeats included (one entry per
# extracted username); use len(set(x)) instead if distinct users are wanted.
data["account_list"].apply(len)

0      9
1      3
2      7
3      4
4      8
      ..
95     2
96     5
97     3
98     5
99    14
Name: account_list, Length: 100, dtype: int64

In [25]:
# Thread with the longest sender list (row label and its length).
data["account_list"].apply(len).nlargest(1)

26    38
Name: account_list, dtype: int64

In [26]:
# thread_id of row 26, the row reported by the nlargest(1) call in the previous cell.
data.iloc[26]['thread_id']

3170

#### 2-2. Lowercase the body column (1pt)

In [27]:
# Lower-case every email body and keep the result in a new `body_lower` column.
data['body_lower']=data['body'].str.lower()
data['body_lower']

0     ["hi people! right now, i'm currently doing a ...
1     ['i am. i write you from budapest, hungary. i ...
2     ["hi all. i'm possibly needing to use windows ...
3     ['gang i\'m trying to use vc # express 2008. o...
4     ["hi all, i have a vb form which shows a numbe...
                            ...                        
95    ['hi all. i ’ m looking for an accessible ( ke...
96    ['hi, my mail find you well, i ’ m a iphone us...
97    ["hi, user settings json file has errors it i ...
98    ['tell me if you guys get an attachment if i s...
99    ['hi all. as i said days ago. i worked program...
Name: body_lower, Length: 100, dtype: object

#### 2-3. Removing stopwords (1pt)

In [28]:
# Drop English stopwords from the lower-cased bodies.
stop = stopwords.words('english')
# Membership tests against a set are constant-time; checking each word against
# the raw list would rescan the whole list for every word of every email.
stop_set = set(stop)
# NOTE(review): the text is split on whitespace only, so a word with punctuation
# attached (e.g. "now,") is compared as-is and will not match its bare stopword.
data['stopword'] = data['body_lower'].apply(
    lambda text: ' '.join(word for word in text.split() if word not in stop_set)
)
data['stopword']

0     ["hi people! right now, i'm currently project ...
1     ['i am. write budapest, hungary. work area vis...
2     ["hi all. i'm possibly needing use windows xp ...
3     ['gang i\'m trying use vc # express 2008. firs...
4     ["hi all, vb form shows number rows controls. ...
                            ...                        
95    ['hi all. ’ looking accessible ( keyboard + sc...
96    ['hi, mail find well, ’ iphone user. ’ contact...
97    ["hi, user settings json file errors can't get...
98    ['tell guys get attachment send one. recently,...
99    ['hi all. said days ago. worked programming da...
Name: stopword, Length: 100, dtype: object

#### 2-4. Tokenization and removing punctuation (1pt)

In [29]:
# Tokenize the stopword-filtered text, then keep only tokens made up entirely
# of alphanumeric characters (this drops punctuation tokens).
data['token'] = data['stopword'].apply(word_tokenize)
data['punct_token'] = data['token'].apply(lambda tokens: list(filter(str.isalnum, tokens)))
data['punct_token']

0     [hi, people, right, now, i, currently, project...
1     [i, am, write, budapest, hungary, work, area, ...
2     [hi, all, i, possibly, needing, use, windows, ...
3     [trying, use, vc, express, first, page, open, ...
4     [hi, all, vb, form, shows, number, rows, contr...
                            ...                        
95    [all, looking, accessible, keyboard, screen, r...
96    [mail, find, well, iphone, user, contacted, pa...
97    [hi, user, settings, json, file, errors, ca, g...
98    [guys, get, attachment, send, one, recently, c...
99    [all, said, days, ago, worked, programming, da...
Name: punct_token, Length: 100, dtype: object

#### 2-5. Lemmatization (1pt)

In [30]:
# nltk, WordNetLemmatizer, wordnet and the shared `lemmatizer` instance are all
# set up in the first cell of the notebook, so nothing is re-imported here.
# Input to the lemmatization step: the punctuation-free token lists.
data["punct_token"]

0     [hi, people, right, now, i, currently, project...
1     [i, am, write, budapest, hungary, work, area, ...
2     [hi, all, i, possibly, needing, use, windows, ...
3     [trying, use, vc, express, first, page, open, ...
4     [hi, all, vb, form, shows, number, rows, contr...
                            ...                        
95    [all, looking, accessible, keyboard, screen, r...
96    [mail, find, well, iphone, user, contacted, pa...
97    [hi, user, settings, json, file, errors, ca, g...
98    [guys, get, attachment, send, one, recently, c...
99    [all, said, days, ago, worked, programming, da...
Name: punct_token, Length: 100, dtype: object

In [31]:
def nltk_tag_to_wordnet_tag(nltk_tag):
    """Translate an NLTK POS tag into the matching WordNet POS constant.

    The decision is made on the tag's leading letter:
    'J' -> adjective, 'V' -> verb, 'N' -> noun, 'R' -> adverb.

    Returns:
        The corresponding ``wordnet`` constant, or ``None`` when the tag
        starts with none of those letters.
    """
    if nltk_tag.startswith('J'):  # adjective
        return wordnet.ADJ
    if nltk_tag.startswith('V'):  # verb
        return wordnet.VERB
    if nltk_tag.startswith('N'):  # noun
        return wordnet.NOUN
    if nltk_tag.startswith('R'):  # adverb
        return wordnet.ADV
    # Any other tag has no WordNet counterpart here.
    return None

In [32]:
def lemmatize_sentence(sentence):
    """Lemmatize every token of ``sentence`` using its part-of-speech tag.

    The sentence is tokenized and POS-tagged with NLTK. Tokens whose tag maps
    to a WordNet POS are lemmatized with that POS; all other tokens are kept
    unchanged.

    Returns:
        The processed tokens joined back together with single spaces.
    """
    tagged_tokens = nltk.pos_tag(nltk.word_tokenize(sentence))
    lemmas = []
    for token, pos_tag in tagged_tokens:
        wordnet_pos = nltk_tag_to_wordnet_tag(pos_tag)
        # No WordNet POS available: pass the token through as-is.
        lemmas.append(
            token if wordnet_pos is None else lemmatizer.lemmatize(token, wordnet_pos)
        )
    return " ".join(lemmas)

In [33]:
# Re-join each token list into a sentence, lemmatize it, then tokenize the
# lemmatized sentence again so the column holds lists of lemmas.
data['lemma'] = (
    data['punct_token']
    .apply(' '.join)
    .apply(lemmatize_sentence)
    .apply(word_tokenize)
)
data['lemma']

0     [hi, people, right, now, i, currently, project...
1     [i, be, write, budapest, hungary, work, area, ...
2     [hi, all, i, possibly, need, use, window, xp, ...
3     [try, use, vc, express, first, page, open, app...
4     [hi, all, vb, form, show, number, row, control...
                            ...                        
95    [all, look, accessible, keyboard, screen, read...
96    [mail, find, well, iphone, user, contact, pack...
97    [hi, user, setting, json, file, error, ca, get...
98    [guy, get, attachment, send, one, recently, cr...
99    [all, say, day, ago, work, program, database, ...
Name: lemma, Length: 100, dtype: object

#### 2-6. Count the frequency of unigrams (1pt)

In [34]:
# Flatten all lemma lists into one stream and report the ten most frequent unigrams.
Counter(itertools.chain.from_iterable(data['lemma'])).most_common(10)

[('use', 440),
 ('would', 275),
 ('work', 250),
 ('get', 234),
 ('i', 233),
 ('file', 192),
 ('know', 177),
 ('like', 177),
 ('line', 174),
 ('code', 165)]

### 3. Building edges (2pt)

#### 3-1. Combination

In [21]:
# Every unordered pair of sender entries within a thread becomes an edge.
# Sender lists can repeat a username, so pairs of identical users (self-loops)
# are produced here as well.
edges = [
    pair
    for accounts in data['account_list']
    for pair in itertools.combinations(accounts, 2)
]

print(len(edges))

2820


#### 3-2. Removing self-loop (1pt)

In [22]:
# Keep only edges that connect two different users.
edges_loop = [(source, target) for source, target in edges if source != target]

print("Number of edges before removing self-loops:", len(edges))
print("Number of edges after removing self-loops:", len(edges_loop))

Number of edges before removing self-loops: 2820
Number of edges after removing self-loops: 2335
