In [1]:
import numpy as np
import pandas as pd

In [2]:
target = ['doctor', 'engineer', 'teacher', 'lawyer', 'accountant', 'nurse', 'police', 'architect', 'dentist', 'pharmacist']

In [3]:
df = pd.read_excel('TaskData02.xlsx')

In [4]:
df.to_csv('CareerData.csv', index = False)
df

Unnamed: 0,Profession
0,cokter
1,dentiists
2,Enginir
3,PoLICE
4,engneiear
...,...
1401,enginir
1402,aCcountANT
1403,accountant
1404,POlIcE


In [5]:
df = df.values.tolist()

In [6]:
df = np.array(df)
df_lower = np.char.lower(df)
df_lower

array([['cokter'],
       ['dentiists'],
       ['enginir'],
       ...,
       ['accountant'],
       ['police'],
       ['enjineer']], dtype='<U12')

In [7]:
target.sort()
target

['accountant',
 'architect',
 'dentist',
 'doctor',
 'engineer',
 'lawyer',
 'nurse',
 'pharmacist',
 'police',
 'teacher']

In [8]:
# First let us try the graph method.
# Lets make an array of dict
from collections import defaultdict
from collections import Counter

def count_letter_occurrences(words):
    letter_counts = defaultdict(lambda: defaultdict(int))
    
    for word in words:
        for letter in word:
            letter_counts[word][letter] += 1
    
    # Convert defaultdict to regular dictionary
    letter_counts_dict = {}
    for word, counts in letter_counts.items():
        letter_counts_dict[word] = dict(counts)
    
    return letter_counts_dict

In [9]:
## Now, our function is ready. 
CountDict = count_letter_occurrences(target)
CountDict

{'accountant': {'a': 2, 'c': 2, 'o': 1, 'u': 1, 'n': 2, 't': 2},
 'architect': {'a': 1, 'r': 1, 'c': 2, 'h': 1, 'i': 1, 't': 2, 'e': 1},
 'dentist': {'d': 1, 'e': 1, 'n': 1, 't': 2, 'i': 1, 's': 1},
 'doctor': {'d': 1, 'o': 2, 'c': 1, 't': 1, 'r': 1},
 'engineer': {'e': 3, 'n': 2, 'g': 1, 'i': 1, 'r': 1},
 'lawyer': {'l': 1, 'a': 1, 'w': 1, 'y': 1, 'e': 1, 'r': 1},
 'nurse': {'n': 1, 'u': 1, 'r': 1, 's': 1, 'e': 1},
 'pharmacist': {'p': 1,
  'h': 1,
  'a': 2,
  'r': 1,
  'm': 1,
  'c': 1,
  'i': 1,
  's': 1,
  't': 1},
 'police': {'p': 1, 'o': 1, 'l': 1, 'i': 1, 'c': 1, 'e': 1},
 'teacher': {'t': 1, 'e': 2, 'a': 1, 'c': 1, 'h': 1, 'r': 1}}

In [10]:
import matplotlib.pyplot as plt

In [11]:
def plot_letter_occurrences(words):
    letter_counts = count_letter_occurrences(words)
    
    for word, counts in letter_counts.items():
        # Extract letters and counts
        letters = list(counts.keys())
        counts_values = list(counts.values())
        
        # Plot
        plt.figure(figsize=(8, 6))
        plt.plot(letters, counts_values, color='skyblue')
        plt.xlabel('Letters')
        plt.ylabel('Count')
        plt.title(f'Letter occurrences for word: {word}')
        plt.show()

### Let's go ahead with Leveshtein distance
    Leveshtine distance is basically a problem of 2D dp. That's why I found it interesting. 

In [12]:
def levenshtein_distance(s1, s2):
    ## Base case
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

In [24]:
print(levenshtein_distance('Anshuman', 'Pranjal'))

7


In [14]:
### Now, we will check for each word in the data frame for the leveshtine distance from each profession and for each
### word, we will give the word with least leveshtine distance
l = len(df_lower)
bestChoice = []
for i in range(l):
    min_dist = 0
    distances = []
    for profession in target:
        ans = levenshtein_distance(df_lower[i, 0], profession)
        distances.append(ans)
    min_dist = min(distances)
    min_index = distances.index(min_dist)       # -> This would give the index of the min distance
    bestChoice.append(target[min_index])
    distances.clear()
    
bestChoice
        

['doctor',
 'dentist',
 'engineer',
 'police',
 'engineer',
 'doctor',
 'dentist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'teacher',
 'doctor',
 'nurse',
 'architect',
 'teacher',
 'engineer',
 'dentist',
 'police',
 'engineer',
 'engineer',
 'engineer',
 'accountant',
 'teacher',
 'police',
 'dentist',
 'pharmacist',
 'doctor',
 'police',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'lawyer',
 'dentist',
 'doctor',
 'doctor',
 'teacher',
 'teacher',
 'doctor',
 'doctor',
 'police',
 'teacher',
 'engineer',
 'teacher',
 'engineer',
 'teacher',
 'architect',
 'engineer',
 'pharmacist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'accountant',
 'dentist',
 'teacher',
 'police',
 'doctor',
 'architect',
 'engineer',
 'architect',
 'doctor',
 'lawyer',
 'engineer',
 'dentist',
 'teacher',
 'engineer',
 'engineer',
 'police',
 'teacher',
 'architect',
 'lawyer',
 'engineer',
 'accountant',
 'teacher',
 'engineer',
 'police',
 'engineer',
 'police',
 'teac

### Jaccard Similarity
    Let's also take Jaccard similarity into account. 

In [15]:
def intersection(str1, str2):
    set1 = set(str1)
    set2 = set(str2)
    
    intersection = set1.intersection(set2)

    result = ''.join(intersection)
    
    return len(result)

In [16]:
def union(s1, s2):
    from collections import Counter
    
    count1 = Counter(s1)
    count2 = Counter(s2)
    
    union_count = Counter()
    
    for char in count1:
        union_count[char] = max(count1[char], count2[char])
    for char in count2:
        if char not in union_count:
            union_count[char] = count2[char]
    
    result = []
    for char in s1 + s2:
        if union_count[char] > 0:
            result.append(char)
            union_count[char] -= 1
    
    return ''.join(result)

In [17]:
def Jaccard_similarity(s1, s2):
    l1 = len(s1)
    l2 = len(s2)
    it = intersection(s1, s2)
    that = len(union(s1, s2))
    return (it/that)

In [18]:
## Now, we will perform Jaccard similarity on each element of the df_lower for each of the professions. 
## The profession with max Jaccard similarity will be the best choice
l = len(df_lower)
bestChoice1 = []
for i in range(l):
    min_dist = 0
    similarity = []
    for profession in target:
        ans = Jaccard_similarity(df_lower[i, 0], profession)
        similarity.append(ans)
    max_sim = max(similarity)
    max_index = similarity.index(max_sim)       # -> This would give the index of the min distance
    bestChoice1.append(target[max_index])
    
bestChoice1

['doctor',
 'dentist',
 'engineer',
 'police',
 'engineer',
 'doctor',
 'dentist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'teacher',
 'doctor',
 'nurse',
 'architect',
 'teacher',
 'engineer',
 'dentist',
 'police',
 'engineer',
 'engineer',
 'engineer',
 'accountant',
 'teacher',
 'police',
 'dentist',
 'pharmacist',
 'doctor',
 'police',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'lawyer',
 'dentist',
 'doctor',
 'doctor',
 'teacher',
 'teacher',
 'doctor',
 'doctor',
 'police',
 'teacher',
 'engineer',
 'teacher',
 'engineer',
 'teacher',
 'architect',
 'engineer',
 'pharmacist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'accountant',
 'dentist',
 'teacher',
 'police',
 'doctor',
 'architect',
 'engineer',
 'teacher',
 'doctor',
 'lawyer',
 'engineer',
 'dentist',
 'teacher',
 'engineer',
 'engineer',
 'police',
 'teacher',
 'architect',
 'lawyer',
 'engineer',
 'accountant',
 'teacher',
 'engineer',
 'police',
 'engineer',
 'police',
 'teache

### Cosine Distance
    Let's also consider Cosine distance. 

In [19]:
def word_to_vector(word):
    # Create a dictionary to count characters in the word
    char_count = Counter(word)
    
    # Create a vector representation of the word
    vector = np.zeros(26)  # Assuming only lowercase English alphabets
    
    for char, count in char_count.items():
        if char.isalpha():
            index = ord(char) - ord('a')  # Convert character to index (0-25)
            vector[index] = count
    
    return vector

In [20]:
def cosine_similarity(vector1, vector2):
    dot_product = np.dot(vector1, vector2)
    norm1 = np.linalg.norm(vector1)
    norm2 = np.linalg.norm(vector2)
    
    similarity = dot_product / (norm1 * norm2)
    return similarity

In [21]:
l = len(df_lower)
def correct_pred(df_lower, target_words):
    
    BestMatch = []
    
    for i in range(l):    
        misspelled_vector = word_to_vector(df_lower[i, 0])
        
        # Convert each target word into a vector and calculate cosine similarity
        similarities = {}
        for target_word in target_words:
            target_vector = word_to_vector(target_word)
            similarity = cosine_similarity(misspelled_vector, target_vector)
            similarities[target_word] = similarity
        
        # Find the target word with the highest cosine similarity
        best_match = max(similarities, key=similarities.get)
        BestMatch.append(best_match)
        
    return BestMatch

In [22]:
bestChoice2 = correct_pred(df_lower, target)
bestChoice2

['doctor',
 'dentist',
 'engineer',
 'police',
 'engineer',
 'doctor',
 'dentist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'teacher',
 'doctor',
 'nurse',
 'architect',
 'teacher',
 'engineer',
 'dentist',
 'police',
 'engineer',
 'engineer',
 'engineer',
 'accountant',
 'teacher',
 'police',
 'dentist',
 'pharmacist',
 'doctor',
 'police',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'lawyer',
 'dentist',
 'pharmacist',
 'doctor',
 'teacher',
 'teacher',
 'doctor',
 'doctor',
 'police',
 'teacher',
 'engineer',
 'teacher',
 'engineer',
 'teacher',
 'architect',
 'engineer',
 'pharmacist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'accountant',
 'dentist',
 'teacher',
 'police',
 'accountant',
 'architect',
 'engineer',
 'architect',
 'architect',
 'lawyer',
 'engineer',
 'dentist',
 'teacher',
 'engineer',
 'engineer',
 'police',
 'teacher',
 'architect',
 'lawyer',
 'engineer',
 'accountant',
 'teacher',
 'engineer',
 'police',
 'engineer',
 'poli

### Final Outcome
    Now, we have predictions from all the algorithms. So, we will consider the outcome given by any two of the three 
    algorithms. If all the algorithms disagree, we will consider the answer of Leveshtein distance as it is an outcome
    of dynamic programming would be thus more accurate.

In [23]:
finalChoice = []
for i in range(l):
    ## we will return the profession, which would be supported by two other of the bestChoiceis.
    if bestChoice[i] == bestChoice1[i] != bestChoice2[i]:
        finalChoice.append(bestChoice)
    elif bestChoice1[i] == bestChoice2[i] != bestChoice[i]:
        finalChoice.append(bestChoice1)
    elif bestChoice[i] == bestChoice2[i] != bestChoice1[i]:
        finalChoice.append(bestChoice2[i])
    else:
        finalChoice.append(bestChoice[i])
        
finalChoice

['doctor',
 'dentist',
 'engineer',
 'police',
 'engineer',
 'doctor',
 'dentist',
 'police',
 'engineer',
 'dentist',
 'nurse',
 'teacher',
 'doctor',
 'nurse',
 'architect',
 'teacher',
 'engineer',
 'dentist',
 'police',
 'engineer',
 'engineer',
 'engineer',
 'accountant',
 'teacher',
 'police',
 'dentist',
 'pharmacist',
 'doctor',
 'police',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'dentist',
 'doctor',
 'lawyer',
 'dentist',
 ['doctor',
  'dentist',
  'engineer',
  'police',
  'engineer',
  'doctor',
  'dentist',
  'police',
  'engineer',
  'dentist',
  'nurse',
  'teacher',
  'doctor',
  'nurse',
  'architect',
  'teacher',
  'engineer',
  'dentist',
  'police',
  'engineer',
  'engineer',
  'engineer',
  'accountant',
  'teacher',
  'police',
  'dentist',
  'pharmacist',
  'doctor',
  'police',
  'dentist',
  'doctor',
  'dentist',
  'doctor',
  'dentist',
  'doctor',
  'lawyer',
  'dentist',
  'doctor',
  'doctor',
  'teacher',
  'teacher',
  'doctor',
  'doctor',
  'po

## THANK YOU