# Preliminary - Read the CSV

In [290]:
import csv

with open('class.txt') as csvfile:
    for row in csvfile:
          print(row)

Kshemkalyani, Ajay  - parallel computation: models, algorithms, limits|Filtering and Prediction of Hidden Markov Models

Sylvia Wolak  - Descartes' World|American Fiction & Mass Culture

Ali Tafti  - Managerial Decision Making|The Design & Analysis of Trading Agents|Topics in Data Science

Andy Johnson  - Cell & Molecular Biology|Computer Aided Visualization and Design

Vincent Adiutori  - American Fiction & Mass Culture

Alex Furman  - Probabilistic Graphical Models|Distributed Computing through Combinatorial Topology

John Bell  - software engineering|Photo-Lab: The Visual Performance of Rights|Art After India

Elise Archias  - Collections and Visual Knowledge in Early Modern Europe|Intro. to Modernism: Past, Future, Exile, Home

Craig D. Foster  - Mechanics of Solids|Analytic Geometry & Calculus

Lu, V Hui  - innovation and technology management ii|Models of Computation|Biomaterials|Pattern Recognition and Machine Learning|Computational Fluid Dynamics|Writing and Speaking Chineze I


# Part I - Split the rows into logical strings

## Step 1:
First we have to split the name of the professor from the classes

In [415]:
def separate_name_from_classes(row):
    return row.split("  - ")

row = "Kshemkalyani, Ajay  - parallel computation: models, algorithms, limits|Filtering and Prediction of Hidden Markov Models"
name, classes = separate_name_from_classes(row)
print(name, classes)

Kshemkalyani, Ajay parallel computation: models, algorithms, limits|Filtering and Prediction of Hidden Markov Models


## Step 2:
Then we have to separate the classes into a list of classes

In [414]:
def separate_classes(classes):
    return re.split("\|", classes)

classes_list = separate_classes(classes)
print(classes_list)

['Biology', 'Science of Mind']


## Conclusion:
Now we have the logic to split each row into logical strings that we can later clean up.

# Part II - Handling Different Name Formats

In this CSV, names seem to be in the following formats:
    1. John Doe
    2. John H. Doe
    3. John H Doe
    4. Doe, John
    5. Doe, H. John
    6. Doe, H John
    7. Doe
    8. john.doe
    
In this case it would be very simple to create regular expressions to handle the various name formats into one single format. I'll use the following universal format for testing:
 - John Doe (ignoring middle initial)

## Step 1:
Create regular expressions and get the last name only (only last name required for this assignment)

In [96]:
name_formats = ['John Doe', 'John H. Doe', 'John H Doe', 'Doe, John', 'Doe, H. John', 'Doe, H John']

In [97]:
# John Doe
reg1 = re.match(r"(?P<first_name>\w+) (?P<last_name>\w+)", name_formats[0])
print("{0} {1}".format(reg1.group('first_name'), reg1.group('last_name'))) # success!

John Doe


In [98]:
# John H. Doe and John H Doe
pattern = re.compile(r"(?P<first_name>\w+) (?P<middle_initial>)(\w+.|\w+) (?P<last_name>\w+)")
reg2 = re.match(pattern, name_formats[1])
reg3 = re.match(pattern, name_formats[2])
print("{0} {1}".format(reg2.group('first_name'), reg2.group('last_name'))) # success!
print("{0} {1}".format(reg3.group('first_name'), reg3.group('last_name'))) # success!

John Doe
John Doe


In [87]:
# Doe, John
reg4 = re.match(r"(?P<last_name>\w+), (?P<first_name>\w+)", name_formats[3])
print("{0} {1}".format(reg4.group('first_name'), reg4.group('last_name'))) # success!

John Doe


In [94]:
# Doe, H. John and Doe, H John
pattern = re.compile(r"(?P<last_name>\w+), (?P<middle_initial>)(\w+.|\w+) (?P<first_name>\w+)")
reg5 = re.match(pattern, name_formats[4])
reg6 = re.match(pattern, name_formats[5])
print("{0} {1}".format(reg5.group('first_name'), reg5.group('last_name'))) # success!
print("{0} {1}".format(reg6.group('first_name'), reg6.group('last_name'))) # success!

John Doe
John Doe


## Step 2:
Now combine the multiple regular expressions into 2 regular expressions with the following format:
 - John Doe
 - Doe, John
 
 For example, any name with the format "first m.i. last" should be parsed using one regular expression.

In [105]:
# <firstname> <m.i.>(optional) <lastname>
pattern = re.compile(r"(?P<first_name>\w+)( (?P<middle_initial>)(\w+.|\w+) | )(?P<last_name>\w+)")
reg1 = re.match(pattern, name_formats[0])
reg2 = re.match(pattern, name_formats[1])
reg3 = re.match(pattern, name_formats[2])
print(reg1.group('last_name'))
print(reg2.group('last_name'))
print(reg3.group('last_name')) # success!

Doe
Doe
Doe


In [99]:
# <lastname>, <m.i>(optional) or <m.i>.(optional) <firstname>
pattern = re.compile(r"(?P<last_name>\w+),( (?P<middle_initial>)(\w+.|\w+) | )(?P<first_name>\w+)")
reg4 = re.match(pattern, name_formats[3])
reg5 = re.match(pattern, name_formats[4])
reg6 = re.match(pattern, name_formats[5])
print(reg4.group('last_name'))
print(reg5.group('last_name'))
print(reg6.group('last_name')) # success!

Doe
Doe
Doe


## Step3:
Let's combile the two into one easy to use function

In [511]:
def remove_punctuation(word):
    remove_punc_pattern = r'\w*'
    return re.match(remove_punc_pattern, word).group(0)

def get_name(unknown_name_format):
    
    name_dict = {}
    pattern = None
    
    # format is firstname.lastname
    if len(unknown_name_format.split('.')) == 2 and len(unknown_name_format.split(' ')) == 1:
        first_name, last_name = unknown_name_format.split('.')
        return {'last_name':last_name.capitalize(), 'first_name':first_name.capitalize()}
        
    # one word in name
    elif len(unknown_name_format.split(' ')) == 1:
        return {'last_name':remove_punctuation(unknown_name_format).capitalize()}
    
    # if comma after first word
    elif unknown_name_format.split(" ")[0][-1] == ',':
        pattern = re.compile(r"(?P<last_name>\w+,)\s(((?P<middle_initial>\w+.|\w+)\s)|\b)((?P<first_name>\w+))|\b")
        
    else:
        pattern = re.compile(r"(?P<first_name>\w+(.|\b))\s*(((?P<middle_initial>\w+|\w+.)\s)|\b)(?P<last_name>\w*)")
    
    reg = re.match(pattern, unknown_name_format)

    if reg.group('first_name'):
        name_dict['first_name'] = remove_puncuation(reg.group('first_name')).capitalize()

    if reg.group('middle_initial'):
        name_dict['middle_initial'] = remove_punctuation(reg.group('middle_initial')).capitalize()

    if reg.group('last_name'):
        name_dict['last_name'] = remove_punctuation(reg.group('last_name')).capitalize()
    
    return name_dict;

print(get_name("John Doe"))
print(get_name("John H. Doe"))
print(get_name("John H Doe"))
print(get_name("Doe, John"))
print(get_name("Doe, H John"))
print(get_name("Doe, H. John"))
print(get_name("Doe"))
print(get_name("Doe,"))
print(get_name("J. Homer Doe"))
print(get_name("john.doe"))

{'last_name': 'Doe', 'first_name': 'John'}
{'last_name': 'Doe', 'middle_initial': 'H', 'first_name': 'John'}
{'last_name': 'Doe', 'middle_initial': 'H', 'first_name': 'John'}
{'last_name': 'Doe', 'first_name': 'John'}
{'last_name': 'Doe', 'middle_initial': 'H', 'first_name': 'John'}
{'last_name': 'Doe', 'middle_initial': 'H', 'first_name': 'John'}
{'last_name': 'Doe'}
{'last_name': 'Doe'}
{'last_name': 'Doe', 'middle_initial': 'Homer', 'first_name': 'J'}
{'first_name': 'John', 'last_name': 'Doe'}


## Conclusion
Now we have the proper regular expressions to handle the various name formats that exist.

# Part III - Clean up the classes
Now that we've cleaned up the names, let's clean up the classes. By looking at the classes you'll notice that there are the following issues:
    1. Incorrect spelling in some cases
    2. Incorrect caps
    3. More? (look at the data more closely for more possible issues)

## Step 1:
Let's begin with ensuring each sentence has proper spelling. This is especially difficult because we have to account for special cases such as roman numerals and abbreviations (i.e. intro).

For the first step of spelling, I will use the python library "enchant" to discover when a word is mispelled. Take note that it looks like the person who wrote this document made single character mistakes. We can fix that with echant using sudo logic that looks like this:

  ```
  if a word is mispelled
    suggest new words
    iterate through the suggested words
    if the new suggested word is the same length but has once character off
      take that suggestion
    ```

In [398]:
import enchant

def correct_sentence_spelling(sentence):
    d = enchant.Dict("en_US")
    corrected_sentence = []
    
    for word_with_special_chars in sentence.split(" "):
        word = ''
        reg = None
        
        # replace any '&' with 'and'
        if word_with_special_chars == '&':
            word = 'and'
            
        reg = re.match(r"(?P<pre>[.,!?;:']*)(?P<word>[\w&]*)(?P<post>[.,!?;:']*)", word_with_special_chars) # group words and punctuation
        alpha_numeric_word = reg.group('word')
        
        #save punctuation before and after the word
        pre = reg.group('pre')
        post = reg.group('post')
        
        if d.check(alpha_numeric_word) == False: # word isn't spelled correctly

            # find first closest suggested word
            for suggested_word in d.suggest(alpha_numeric_word):
                if len(alpha_numeric_word) == len(suggested_word):
                    word = suggested_word
                    break
        else:
            word = alpha_numeric_word
        
        word = word_with_special_chars.replace(word_with_special_chars, word)
        corrected_sentence.append(pre+word+post)
    return " ".join(corrected_sentence)

example_class_list = ['parallel computation: modelz, algorithms, limitz',
                      'Filtering and Prediction of Hidden Markov Modelz', 'computer vizion part ii', 
                      "Descartes' World", 'American Fiction & Mass Culture\n', 'Biomaterials']

for sentence in example_class_list:
    print(correct_sentence_spelling(sentence))


parallel computation: models, algorithms, limits
Filtering and Prediction of Hidden Markov Models
computer vision part ii
Descartes' World
American Fiction & Mass Culture
False


['Bio materials',
 'Bio-materials',
 'Materials',
 "Material's",
 'Immaterial',
 'Material',
 "Materiel's",
 'Bilateral',
 'Imperials',
 'Immaterially',
 "Imperial's"]

## Step 2:
Now that we've ensured the spelling is correct for the most part, we need to find a way to capitalize each word appropriately. We should take the following rules into account:
    - First word of a sentence is capitalized
    - Last word of a sentence is capitalized
    - The following words will not be capitalized: a, an, the, at, by, for, in, of, on, to, up, and, as, but, or. 

[Word Source](http://grammar.yourdictionary.com/capitalization/rules-for-capitalization-in-titles.html#QKr4elbmMtimKfJz.99)


In [405]:
import string
import re

def fix_sentence_case(sentence):
    
    lower_case_words = ['a', 'an', 'the', 'at', 'by', 'for', 'in', 'of', 'on', 'to', 'up', 'and', 'as', 'but', 'or']
    
    old_sentence_list = sentence.split(" ")
    
    first_word = old_sentence_list[0].capitalize()
    
    if len(old_sentence_list) == 1:
        return first_word
    
    last_word = old_sentence_list[-1].capitalize()
    
    new_sentence = []
    new_sentence.append(first_word)
    
    old_sentence_list.pop(0) # remove first word, it's been capitalized and saved
    
    if len(old_sentence_list) > 1:
        old_sentence_list.pop(len(old_sentence_list)-1) # remove the last word, it's been capitalized and saved

    for word in old_sentence_list:
            
        reg_ex = re.match(r"(\w+)", word) # strip special characters i.e. ':' or ','
        
        # test that the word is alpha numeric
        if reg_ex is not None and reg_ex.group(0) not in lower_case_words:
            alpha_numeric_word = reg_ex.group(0)
            capitalized_word = alpha_numeric_word.capitalize()
            word = word.replace(alpha_numeric_word, capitalized_word) # maintain special chars i.e. 'Test:' or 'yes,'
            new_sentence.append(word)
            
        else:
            new_sentence.append(word)

    new_sentence.append(last_word)
    return " ".join(new_sentence)

sentences = ['pompeii\'s fall 101: art, architecture, & archaeology in the lost city', 'Descartes\' World', 'Biomaterials']

for sentence in sentences:
    print(fix_sentence_case(sentence))


Pompeii's Fall 101: Art, Architecture, & Archaeology in the Lost City
Descartes' World World
Biomaterials


# Part IV - Combining the Pieces
Now that we've built a series of functions that are workable with our given text file, we can now combine them to create a "clean copy".

## Step 1:
Take a list of classes and append them with a "|" as in the original text file

In [424]:
def join_sentences_with_pipe(sentences):
    return '|'.join(sentences)

sentences = ['this is a sentence', 'this is also a sentence', 'this is another sentence']
print(join_sentences_with_pipe(sentences))

this is a sentence|this is also a sentence|this is another sentence


## Step 2:
Now that we've merges the classes, we can now combine the classes with the professor name. In this example, I will use the last name of the professor because it's conveniately unique.

In [427]:
def join_name_with_classes(name, classes):
    return name + '  - ' + classes

classes = 'Biology|Science of Mind'
name = 'Doe'
print(join_name_with_classes(name, classes))

Doe  - Biology|Science of Mind


## Step 3:
Now that we've cleaned the data, let's combine the data into a dictionary where the professor name is the key and the courses he/she teaches is the value.

Note: In this case, we must assume that there are duplicate rows in the CSV. A dictionary will handle this case well.

In [438]:
def build_dictionary(dictionary, name, classes):
    try:
        for a_class in classes:
            dictionary[name].append(a_class)
    except KeyError:
        dictionary[name] = classes
    return dictionary

temp_dict = {}
name = 'Doe'
classes = ['The science of mind', 'Let\'s Embrace Fear']

print(build_dictionary(temp_dict, name, classes))

name = 'Doe'
classes = ['Living life fully']

print(build_dictionary(temp_dict, name, classes))

name = 'Smith'
classes = ['It\'s just the way it is']

print(build_dictionary(temp_dict, name, classes))

{'Doe': ['The science of mind', "Let's Embrace Fear"]}
{'Doe': ['The science of mind', "Let's Embrace Fear", 'Living life fully']}
{'Smith': ["It's just the way it is"], 'Doe': ['The science of mind', "Let's Embrace Fear", 'Living life fully']}


## Conclusion:
For the final step, let's combine the functions we created throughout this tutorial to generate a document that is properly capitalized, and spelled.

In [514]:
import csv
import enchant
import re

cleaned_data_dict = {}
with open('class.txt') as csvfile:
    csv_data_dict = {}
    for row_dirty in csvfile:
        name_dirty, classes_dirty = separate_name_from_classes(row_dirty)
        classes_list_dirty = separate_classes(classes_dirty)
        name_clean = get_name(name_dirty)['last_name']
        classes_list_clean = []
        for a_class_dirty in classes_list_dirty:
#             a_class_dirty = correct_sentence_spelling(a_class_dirty) # TODO
            a_class_clean = fix_sentence_case(a_class_dirty)
            classes_list_clean.append(a_class_clean)
        cleaned_data_dict = build_dictionary(cleaned_data_dict, name_clean, classes_list_clean)
#         classes_clean = join_sentences_with_pipe(classes_list_clean)
#         row_clean = join_name_with_classes(name_clean, classes_clean)
#         print(row_clean)
print(cleaned_data_dict)
            
            

{'Siber': ['Reading & Research', '3d Photography\n Photography\n', 'Cell & Molecular Biology\n'], 'Vinella': ['Topics in Computer Vision', 'Software System Design', 'Intro to Combinatorial Optimization\n'], 'Kanich': ['Computer Networks Networks', 'Nineteenth-century British Novel', 'Accelerated Introduction to Computer Science\n'], 'Walden': ['Topics in Ecology and Evolutionary Biology', 'Back to the Future: Nostalgia & Futurity in Contemporary Sci-Fi Tv & Telefantasy\n', 'Special Topicz in Advanced Algorithms', 'Environmental History History', 'Introduction to Systems Programming', 'Topics in 3d Game Engine Development', 'Human Factors and User Interface Design\n', 'Hilbert Spaces & Their Applications', 'Interdisciplinary Scientific Visualization\n'], 'Sloan': ['Introduction to Computational Linguistics\n'], 'Yu': ['Special Topics in Advanced Algorithms\n', 'Altered States States', 'Applied Artifical Intelligence\n', 'Computers and Human Values\n'], 'Marai': ['2d Game Engines', 'Temp