#### Name: ABISHEK N
#### Roll NO: 215229101



## LAB 6 :Clustering the job titles of LinkedIn Connections using Greedy Heuristic Algorithm



#**Profile Analysis**

In [14]:
import os
import csv
import numpy as np

# Point this to your 'Connections.csv' file.
CSV_FILE = os.path.join('resources', 'ch04-linkedin', 'Connections.csv')

csvReader = csv.DictReader(open("Connections.csv"), delimiter=',', quotechar='"')
contacts = [row for row in csvReader]

###Simple normalization of company suffixes from address book data


In [15]:
from prettytable import PrettyTable # pip install prettytable
from collections import Counter
from operator import itemgetter

# Define a set of transforms that converts the first item
# to the second item. Here, we're simply handling some
# commonly known abbreviations, stripping off common suffixes, 
# etc.

transforms = [(', Inc.', ''), (', Inc', ''), (', LLC', ''), (', LLP', ''),
               (' LLC', ''), (' Inc.', ''), (' Inc', '')]

companies = [c['Company'].strip() for c in contacts if c['Company'].strip() != '']

for i, _ in enumerate(companies):
    for transform in transforms:
        companies[i] = companies[i].replace(*transform)

pt = PrettyTable(field_names=['Company', 'Freq'])
pt.align = 'l'
c = Counter(companies)

[pt.add_row([company, freq]) for (company, freq) in sorted(c.items(), key=itemgetter(1), reverse=True) if freq > 1]

print(pt)


+--------------------------------------------------+------+
| Company                                          | Freq |
+--------------------------------------------------+------+
| Bishop Heber College, Tiruchirappalli - 620 017. | 4    |
+--------------------------------------------------+------+


###Standardizing common job titles and computing their frequencies

In [16]:
transforms = [
    ('Sr.', 'Senior'),
    ('Sr', 'Senior'),
    ('Jr.', 'Junior'),
    ('Jr', 'Junior'),
    ('CEO', 'Chief Executive Officer'),
    ('COO', 'Chief Operating Officer'),
    ('CTO', 'Chief Technology Officer'),
    ('CFO', 'Chief Finance Officer'),
    ('VP', 'Vice President'),
    ]

# Read in a list of titles and split apart
# any combined titles like "President/CEO."
# Other variations could be handled as well, such
# as "President & CEO", "President and CEO", etc.

titles = []
for contact in contacts:
    titles.extend([t.strip() for t in contact['Position'].split('/')
                  if contact['Position'].strip() != ''])

# Replace common/known abbreviations

for i, _ in enumerate(titles):
    for transform in transforms:
        titles[i] = titles[i].replace(*transform)

# Print out a table of titles sorted by frequency

pt = PrettyTable(field_names=['Job Title', 'Freq'])
pt.align = 'l'
c = Counter(titles)
[pt.add_row([title, freq]) 
 for (title, freq) in sorted(c.items(), key=itemgetter(1), reverse=True) 
     if freq > 1]
print(pt)

# Print out a table of tokens sorted by frequency

tokens = []
for title in titles:
    tokens.extend([t.strip(',') for t in title.split()])
pt = PrettyTable(field_names=['Token', 'Freq'])
pt.align = 'l'
c = Counter(tokens)
[pt.add_row([token, freq]) 
 for (token, freq) in sorted(c.items(), key=itemgetter(1), reverse=True) 
     if freq > 1 and len(token) > 2]
print(pt)

+----------------+------+
| Job Title      | Freq |
+----------------+------+
| Guest Lecturer | 2    |
+----------------+------+
+-----------+------+
| Token     | Freq |
+-----------+------+
| Data      | 5    |
| Associate | 4    |
| Analyst   | 4    |
| Intern    | 2    |
| Guest     | 2    |
| Lecturer  | 2    |
| Student   | 2    |
| Science   | 2    |
+-----------+------+


###Geocoding locations with Google Maps

NOTE:This section contains no locations bcoz there is no geocode generated

In [17]:
for i, c in enumerate(contacts):
    progress = '{0:3d} of {1:3d} - '.format(i+1,len(contacts))
    company = c['Company']
    try:
        location = g.geocode(company, exactly_one=True)
    except:
        print('... Failed to get a location for {0}'.format(company))
        location = None
    
    if location != None:
        c.update([('Location', location)])
        print(progress + company[:50] + ' -- ' + location.address)
    else:
        c.update([('Location', None)])
        print(progress + company[:50] + ' -- ' + 'Unknown Location')

... Failed to get a location for 
  1 of  52 -  -- Unknown Location
... Failed to get a location for 
  2 of  52 -  -- Unknown Location
... Failed to get a location for 
  3 of  52 -  -- Unknown Location
... Failed to get a location for 
  4 of  52 -  -- Unknown Location
... Failed to get a location for 
  5 of  52 -  -- Unknown Location
... Failed to get a location for 
  6 of  52 -  -- Unknown Location
... Failed to get a location for 
  7 of  52 -  -- Unknown Location
... Failed to get a location for Ernst & Young Global Services LLP
  8 of  52 - Ernst & Young Global Services LLP -- Unknown Location
... Failed to get a location for 
  9 of  52 -  -- Unknown Location
... Failed to get a location for MENMOZHI TECHNOLOGIES
 10 of  52 - MENMOZHI TECHNOLOGIES -- Unknown Location
... Failed to get a location for 
 11 of  52 -  -- Unknown Location
... Failed to get a location for 
 12 of  52 -  -- Unknown Location
... Failed to get a location for Tech Mahindra
 13 of  52 - Tech Mahindra --

#**Clustering job titles using a greedy heuristic**

In [18]:
from nltk.util import bigrams

ceo_bigrams = list(bigrams("Chief Executive Officer".split(), pad_left=True, pad_right=True))
cto_bigrams = list(bigrams("Chief Technology Officer".split(), pad_left=True, pad_right=True))

print(ceo_bigrams)
print(cto_bigrams)

print(len(set(ceo_bigrams).intersection(set(cto_bigrams))))

[(None, 'Chief'), ('Chief', 'Executive'), ('Executive', 'Officer'), ('Officer', None)]
[(None, 'Chief'), ('Chief', 'Technology'), ('Technology', 'Officer'), ('Officer', None)]
2


###Jaccard distance calculation

In [19]:
from nltk.metrics.distance import jaccard_distance # pip install nltk

job_title_1 = 'Chief Executive Officer'.split()
job_title_2 = 'Chief Technology Officer'.split()

print(job_title_1)
print(job_title_2)

print()
print('Intersection:')
intersection = set(job_title_1).intersection(set(job_title_2))
print(intersection)

print()
print('Union:')
union = set(job_title_1).union(set(job_title_2))
print(union)

print()
print('Similarity:', len(intersection) / len(union))
print('Distance:', jaccard_distance(set(job_title_1), set(job_title_2)))

['Chief', 'Executive', 'Officer']
['Chief', 'Technology', 'Officer']

Intersection:
{'Officer', 'Chief'}

Union:
{'Chief', 'Officer', 'Executive', 'Technology'}

Similarity: 0.5
Distance: 0.5


In [20]:
job_title_1 = 'Vice President, Sales'.split()
job_title_2 = 'Vice President, Customer Relations'.split()

print(job_title_1)
print(job_title_2)

print()
print('Intersection:')
intersection = set(job_title_1).intersection(set(job_title_2))
print(intersection)

print()
print('Union:')
union = set(job_title_1).union(set(job_title_2))
print(union)

print()
print('Similarity:', len(intersection) / len(union))
print('Distance:', jaccard_distance(set(job_title_1), set(job_title_2)))

['Vice', 'President,', 'Sales']
['Vice', 'President,', 'Customer', 'Relations']

Intersection:
{'President,', 'Vice'}

Union:
{'Customer', 'Sales', 'Vice', 'Relations', 'President,'}

Similarity: 0.4
Distance: 0.6


In [21]:
contacts

[{'First Name': 'Abhishek',
  'Last Name': 'Shrivastava',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'sathish',
  'Last Name': 'kumar',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'Nivetha',
  'Last Name': 'Murugan',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'SAMIDURAI',
  'Last Name': 'S',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'ARUN',
  'Last Name': 'KUMAR M',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'Rahul',
  'Last Name': 'Venkat S',
  'Email Address': '',
  'Company': '',
  'Position': '',
  'Connected On': '06-Aug-22',
  'Location': None},
 {'First Name': 'Kirubha',


In [22]:
import json
# Tweak this distance threshold and try different distance calculations 
# during experimentation
DISTANCE_THRESHOLD = 0.6
DISTANCE = jaccard_distance



def cluster_contacts_by_title():

    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
        ]

    separators = ['/', ' and ', ' & ', '|', ',']

    # Normalize and/or replace known abbreviations
    # and build up a list of common titles.

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Position'] == '':
            contacts[i]['Position'] = ['']
            continue
        titles = [contacts[i]['Position']]
        
        all_titles.extend(titles)

    all_titles = list(set(all_titles))

    clusters = {}
    for title1 in all_titles:
        clusters[title1] = []
        for title2 in all_titles:
            if title2 in clusters[title1] or title2 in clusters and title1 \
                in clusters[title2]:
                continue
            try:
                distance = DISTANCE(set(title1.split()), set(title2.split()))
            except:
                print(title1.split())
                print(title2.split())
                continue

            if distance < DISTANCE_THRESHOLD:
                clusters[title1].append(title2)

    # Flatten out clusters
    clusters = [clusters[title] for title in clusters if len(clusters[title]) > 1]

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Position']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append('{0} {1}.'.format(
                        contact['FirstName'], contact['LastName'][0]))

    return clustered_contacts


clustered_contacts = cluster_contacts_by_title()

for titles in clustered_contacts:
    common_titles_heading = 'Common Titles: ' + ', '.join(titles)

    descriptive_terms = set(titles[0].split())
    for title in titles:
        descriptive_terms.intersection_update(set(title.split()))
    if len(descriptive_terms) == 0: descriptive_terms = ['***No words in common***']
    descriptive_terms_heading = 'Descriptive Terms: ' \
        + ', '.join(descriptive_terms)
    print(common_titles_heading)
    print('\n'+descriptive_terms_heading)
    print('-' * 70)
    print('\n'.join(clustered_contacts[titles]))
    print()

Common Titles: Summer Intern, Intern

Descriptive Terms: Intern
----------------------------------------------------------------------


Common Titles: Data Analyst, Associate Data Analyst 

Descriptive Terms: Data, Analyst
----------------------------------------------------------------------


Common Titles: Student, Student Researcher

Descriptive Terms: Student
----------------------------------------------------------------------




###How to export data to power a dendogram and node-link tree visualization

In [23]:
pip install cluster

Note: you may need to restart the kernel to use updated packages.


In [24]:
import cluster

In [25]:
contacts

[{'First Name': 'Abhishek',
  'Last Name': 'Shrivastava',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'sathish',
  'Last Name': 'kumar',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'Nivetha',
  'Last Name': 'Murugan',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'SAMIDURAI',
  'Last Name': 'S',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'ARUN',
  'Last Name': 'KUMAR M',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '13-Aug-22',
  'Location': None},
 {'First Name': 'Rahul',
  'Last Name': 'Venkat S',
  'Email Address': '',
  'Company': '',
  'Position': [''],
  'Connected On': '06-Aug-22',
  'Location': None},
 {'First Name':

In [26]:
import nltk
nltk.download('stopwords')
from nltk.metrics.distance import jaccard_distance
from nltk.corpus import stopwords # nltk.download('stopwords')
from cluster import HierarchicalClustering # pip install cluster
import os
CSV_FILE = os.path.join('Connections.csv')

# Tweak this distance threshold and try different distance calculations 
# during experimentation
DISTANCE_THRESHOLD = 0.5
DISTANCE = jaccard_distance

# Adjust sample size as needed to reduce the runtime of the
# nested loop that invokes the DISTANCE function
SAMPLE_SIZE = 500

def cluster_contacts_by_title(csv_file):

    csvReader = csv.DictReader(open(csv_file), delimiter=',', quotechar='"')
    contacts = [row for row in csvReader]
    contacts = contacts[:SAMPLE_SIZE]
    
    transforms = [
        ('Sr.', 'Senior'),
        ('Sr', 'Senior'),
        ('Jr.', 'Junior'),
        ('Jr', 'Junior'),
        ('CEO', 'Chief Executive Officer'),
        ('COO', 'Chief Operating Officer'),
        ('CTO', 'Chief Technology Officer'),
        ('CFO', 'Chief Finance Officer'),
        ('VP', 'Vice President'),
        ]

    separators = ['/', ' and ', '|', ',', ' & ']

    # Normalize and/or replace known abbreviations
    # and build up a list of common titles.

    all_titles = []
    for i, _ in enumerate(contacts):
        if contacts[i]['Position'] == '':
            contacts[i]['Position'] = ['']
            continue
        titles = [contacts[i]['Position']]
        for separator in separators:
            for title in titles:
                if title.find(separator) >= 0:
                    titles.remove(title)
                    titles.extend([title.strip() for title in title.split(separator) if title.strip() != ''])

        for transform in transforms:
            titles = [title.replace(*transform) for title in titles]
            
        contacts[i]['Position'] = titles
        all_titles.extend(titles)

    all_titles = list(set(all_titles))
    
    # Define a scoring function
    def score(title1, title2): 
        return DISTANCE(set(title1.split()), set(title2.split()))

    # Feed the class your data and the scoring function
    hc = HierarchicalClustering(all_titles, score)

    # Cluster the data according to a distance threshold
    clusters = hc.getlevel(DISTANCE_THRESHOLD)

    # Remove singleton clusters
    clusters = [c for c in clusters if len(c) > 1]

    # Round up contacts who are in these clusters and group them together
    clustered_contacts = {}
    for cluster in clusters:
        clustered_contacts[tuple(cluster)] = []
        for contact in contacts:
            for title in contact['Position']:
                if title in cluster:
                    clustered_contacts[tuple(cluster)].append('{0} {1}.'.format(
                        contact['First Name'], contact['Last Name'][0]))

    return clustered_contacts, clusters

def get_descriptive_terms(titles):
    flatten = lambda l: [item for sublist in l for item in sublist]
    title_words = flatten([title.split() for title in titles])
    filtered_words = [word for word in title_words \
                      if word not in stopwords.words('english')]
    counter = Counter(filtered_words)
    descriptive_terms = counter.most_common(2)
    # Get the most common title words from a cluster, ignoring singletons
    descriptive_terms = [t[0] for t in descriptive_terms if t[1] > 1]
    return descriptive_terms


def display_output(clustered_contacts, clusters):    
    for title_cluster in clusters:
        descriptive_terms = get_descriptive_terms(title_cluster)
        common_titles_heading = 'Common Titles: ' + ', '.join((t for t in title_cluster))
        descriptive_terms_heading =  'Descriptive Terms: ' + ', '.join((t for t in descriptive_terms))
        
        print(common_titles_heading)
        print(descriptive_terms_heading)
        print('-' * 70)
        #print(title_cluster)
        #print(clustered_contacts)
        print('\n'.join(clustered_contacts[tuple(title_cluster)]))
        print()


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KARTHIK\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [28]:
clustered_contacts

{('Summer Intern', 'Intern'): [],
 ('Data Analyst', 'Associate Data Analyst '): [],
 ('Student', 'Student Researcher'): []}

In [29]:
def write_d3_json_output(clustered_contacts):
    
    json_output = {'name' : 'My LinkedIn', 'children' : []}

    for titles in clustered_contacts:

        descriptive_terms = get_descriptive_terms(titles)

        json_output['children'].append({'name' : ', '.join(descriptive_terms)[:30], 'children' : [ {'name' : c} for c in clustered_contacts[titles] ] } )
        with open("sample.json", "w") as out_file:
            json.dump(json_output,out_file,indent=1)
            out_file.close()
    
clustered_contacts, clusters = cluster_contacts_by_title(CSV_FILE)
display_output(clustered_contacts, clusters)
write_d3_json_output(clustered_contacts)

Common Titles: Intern, Summer Intern
Descriptive Terms: Intern
----------------------------------------------------------------------
Hariharasudhan D.
Yokeshwaran  G.

Common Titles: Student, Student Researcher
Descriptive Terms: Student
----------------------------------------------------------------------
SHARON S.
Arjun V.

Common Titles: Associate Data Analyst , Data Analyst
Descriptive Terms: Data, Analyst
----------------------------------------------------------------------
SONA U.
Jayasurya V.

