# Authors: Abhinav Sharma, Conoly Cravens, and Christian Alfanso

## Document Setup

In [1]:
import pandas as pd
from pandas import Series, DataFrame

In [2]:
import numpy as np
import random
from statistics import *

In [3]:
import re
import math

## 1. Computer -  Rock Paper Scissors

### Game Functions

#### COMPUTER BRAINS - make decision off of human's historic choices. 

In [4]:
def computer_play(choices,human_choices):
    """Pick the computer's move for the next round.

    choices: the moves the computer may pick from when it has no history.
    human_choices: running tally of the human's past moves, ordered
        [rock, paper, scissors].

    Returns the computer's move as a string.
    """
    #no history yet -> nothing to predict, so pick any allowed move at random
    if sum(human_choices) == 0:
        return random.choice(choices)

    #positions of the move(s) the human has played most often (a tie gives several)
    top_count = max(human_choices)
    favourites = [idx for idx, tally in enumerate(human_choices) if tally == top_count]

    #guess that the human repeats one of those favourites, chosen at random
    predicted = random.choice(favourites)

    #play what beats the guess: paper beats rock (0), scissors beat paper (1), rock otherwise
    return {0: 'paper', 1: 'scissors'}.get(predicted, 'rock')

#### HUMAN PLAY - function asks human what they want to play (and validates it)

In [5]:
def person_play():
    """Prompt the human for a move until a valid answer is typed.

    The answer is lower-cased before it is checked.
    Returns 'rock', 'paper', 'scissors' or 'quit'.
    """
    accepted_answers = ('rock','paper','scissors','quit')

    #keep asking until the answer is one of the accepted words
    while True:
        answer = input("Rock, Paper, Scissors? or Quit?").lower()
        if answer in accepted_answers:
            return answer
        print('Invalid Input. Try again!')

#### RECORD FUNCTIONS - these two functions record game historics

In [6]:
#tally the human's move so the computer brains can learn from it
#NOTE: human choice list = [rock,paper,scissors]
def record_human_choice(choice,choice_list):
    """Add one to the slot of choice_list that matches the move.

    Anything that is not 'rock' or 'paper' is counted in the scissors slot.
    The list is updated in place and also returned.
    """
    slot = {'rock': 0, 'paper': 1}.get(choice, 2)
    choice_list[slot] += 1
    return choice_list

#Score one round and announce the result
#NOTE: winner list = [total games, tie game, human wins, computer wins]
def record_game_winner(computer,human,winner_list):
    """Update winner_list for one round, print the outcome and return the list.

    computer, human: the two moves played this round.
    winner_list: [total games, tie game, human wins, computer wins], updated in place.

    A human move that is not rock/paper/scissors (and is not a tie) only
    bumps the total-games counter and prints nothing.
    """
    #human move -> (computer move that beats it, verb when the human loses, verb when the human wins)
    outcomes = {
        'rock': ('paper','covers','smashes'),
        'scissors': ('rock','smashes','cuts up'),
        'paper': ('scissors','cuts up','covers'),
    }

    #every round counts toward the total, whatever the result
    winner_list[0] += 1

    #same move on both sides is a tie
    if computer == human:
        winner_list[1] += 1
        print("It's a draw! We both chose",human)
        return winner_list

    if human not in outcomes:
        return winner_list

    beaten_by, losing_verb, winning_verb = outcomes[human]
    if computer == beaten_by:
        winner_list[3] += 1
        print("I win!",computer,losing_verb,human)
    else:
        #ties were handled above, so any other computer move loses
        winner_list[2] += 1
        print("You win!",human,winning_verb,computer)

    return winner_list

#### Main Function

In [7]:
def main():
    """Run the interactive Rock Paper Scissors loop until the human types quit."""

    #game state: allowed moves, tally of the human's moves, and the scoreboard
    possible_choices = ['rock','paper','scissors']
    human_choices = [0,0,0] #rock,paper,scissors
    game_count = [0,0,0,0] #total, tie, human wins, computer wins

    while True:

        #computer picks its move before the human answers
        computer_choice = computer_play(possible_choices,human_choices)

        #human picks a move (or quit)
        human_choice = person_play()

        #on quit, print the summary and stop
        if human_choice == 'quit':
            print('We played',game_count[0],'games and you won',game_count[2])
            break

        #otherwise score the round and remember the human's move for next time
        record_game_winner(computer_choice,human_choice,game_count)
        record_human_choice(human_choice,human_choices)

#start the interactive game when this cell runs; main() loops until the player types quit
main()

Rock, Paper, Scissors? or Quit?Rock
You win! rock smashes scissors
Rock, Paper, Scissors? or Quit?Paper
It's a draw! We both chose paper
Rock, Paper, Scissors? or Quit?Paper
It's a draw! We both chose paper
Rock, Paper, Scissors? or Quit?Rock
You win! rock smashes scissors
Rock, Paper, Scissors? or Quit?Scissors
You win! scissors cuts up paper
Rock, Paper, Scissors? or Quit?Quit
We played 5 games and you won 3


## 2. Voters in Florida

In [8]:
#Reads the whole HTML file into one string.
#The with-block closes the file even if the read fails part-way.
with open("FloridaVoters.html", "r") as fl_file:
    txt_file = fl_file.read()

##### Find pattern of table in HTML text
*Pattern Example*
<tr>
<td>ALACHUA</td>
<td>47,329</td>
<td>77,996</td>
<td>3,864</td>
<td>34,116</td>
<td>163,305</td>
</tr>

*Regex Pattern:*
<tr>                    #START
<td>[\w\s\.-]*</td>     #County Name - note potential - and .
<td>[\w,]*</td>         #REPUBLICAN - note the comma
<td>[\w,]*</td>         #DEMOCRAT - note the comma
<td>[\w,]*</td>         #MINOR - note the comma
<td>[\w,]*</td>         #NONE - note the comma
<td>[\w,]*</td>         #TOTAL - note the comma
</tr>                   #END

In [9]:
#Grab every complete table row (a name cell followed by five count cells) from the HTML text
txt_filt = re.compile(r'<tr>\n<td>[\w\s\.-]*</td>\n<td>[\w,]*</td>\n<td>[\w,]*</td>\n<td>[\w,]*</td>\n<td>[\w,]*</td>\n<td>[\w,]*</td>\n</tr>').findall(txt_file)

In [10]:
#Can use this to check the number of 'items' it finds in the text file
#len(txt_filt)

In [11]:
#Can use this to demonstrate one of the items in the list we found
#txt_filt[0]

#### Breaks out the table into lists of counties, republicans and democrats

In [12]:
#For each matched row keep only its opening "<tr>" + name cell piece (one list of matches per row)
txt_filt_county = list(map(re.compile(r'<tr>\n<td>[\w\s\.-]*</td>').findall, txt_filt))
#len(txt_filt_county)

In [13]:
#Second copy of the name-cell extraction (same pattern and same input as txt_filt_county)
txt_filt_county2 = [re.compile(r'<tr>\n<td>[\w\s\.-]*</td>').findall(county_row) for county_row in txt_filt]
#txt_filt_county2[0:10]

In [14]:
#collect, for each county row, the count cells that follow the name cell (still wrapped in tag fragments)
txt_filt_values = [re.compile(r'/td>\n<td>[\w,]*<').findall(county_row) for county_row in txt_filt]
#print(txt_filt_values)

#### Create lists for republican and democrat voter counts

In [15]:
#every county's value list starts with the republican cell, followed by the democrat cell
txt_filt_rep = [county_values[0] for county_values in txt_filt_values]
txt_filt_dem = [county_values[1] for county_values in txt_filt_values]

In [16]:
#For check - showing first 10 republican voter counts in list
#txt_filt_rep[0:10]

#### CLEAN STRING FUNCTION - pull out digits and ignore commas

In [17]:
def cleanStrings(original_list):
    """Convert scraped voter-count strings into integers.

    Every element is reduced to its digits (commas, tag fragments and any
    other characters are ignored) and those digits are read as one integer.

    original_list: iterable of strings, e.g. '/td>\\n<td>47,329<'.

    Returns a list of ints in the same order.
    Raises ValueError if an element contains no digit at all.
    """
    #raw string on purpose: in a plain literal '\d' is an invalid escape sequence
    return [int(''.join(re.findall(r'\d', element))) for element in original_list]

#final integer voter counts, one list per party (republican first, then democrat)
rep_list, dem_list = map(cleanStrings, (txt_filt_rep, txt_filt_dem))

In [18]:
#Can use to check first ten elements in republican list
#rep_list[0:10]

In [19]:
#Can use to check first ten elements in democrat list
#dem_list[0:10]

#### Clean County Names

In [20]:
#First, strip the <tr>, <td> and </td> tags from the text form of each county's match list
txt_filt_county_clean = [re.compile("</?t.>").sub("",str(county_matches)) for county_matches in txt_filt_county]
#FOR HELP: txt_filt_county_clean[0:10]

In [21]:
#Second, strip what str() left behind: the escaped newline (a backslash followed by n),
#the list brackets and the quotes.
#The backslash-n is matched as a two-character sequence; putting \\n inside a character
#class would also delete every plain letter "n" from the county names.
txt_filt_county_clean2 = [re.sub(r"\\n|[\[\]']","",element) for  element in txt_filt_county_clean]

In [22]:
#To check
#txt_filt_county_clean2[0:10]

#### Combine to a list of tuples and sort based on democrat values

In [23]:
#one (county, republican, democrat) tuple per county
full_list = list(zip(txt_filt_county_clean2,rep_list,dem_list))

#copy ordered by the democrat count (third field), smallest first
sorted_full_list = list(full_list)
sorted_full_list.sort(key = lambda county_row: county_row[2])
#sorted_full_list[0:5]

#### Create table

In [24]:
df = DataFrame(data=sorted_full_list,columns=["County","Republican","Democratic"])

#Show the table without the "Total" row (df itself keeps that row)
df[df['County'] != "Total"]

Unnamed: 0,County,Republican,Democratic
0,LAFAYETTE,1373,2672
1,GLADES,2190,3110
2,LIBERTY,720,3372
3,UNION,2752,3579
4,GILCHRIST,5789,3652
...,...,...,...
62,ORANGE,206174,303458
63,HILLSBOROUGH,257436,314265
64,PALM BEACH,245452,367236
65,MIAMI-DADE,362161,539367


## 3. Near Duplicate Detection

In [25]:
#Open and read tweets as a list of lines (one tweet per line).
#The with-block closes the file even if reading fails part-way.
with open("Santa.txt", "r") as santa_file:
    santa_text = santa_file.readlines()

### (a) Convert each tweet into a dictionary of phrases

In [26]:
def moving_window(tweet):
    """Break a tweet into its distinct three-word phrases.

    tweet: the tweet text (converted with str() first).

    Returns a dictionary whose keys are the three-word phrases found by
    sliding a window over the tweet one word at a time; every value is 1.
    A tweet with fewer than three words gives an empty dictionary.
    """
    #drop runs of non-word characters that sit between two whitespace characters,
    #then trim the ends and split on single spaces
    words = re.sub(r"\s[^\w]+\s"," ",str(tweet)).strip().split(' ')

    #slide a three-word window across the words
    phrases = [' '.join(words[start:start+3]) for start in range(len(words)-2)]

    #each distinct phrase maps to 1
    return dict.fromkeys(set(phrases),1)

In [27]:
#TO CHECK
#moving_window(santa_text[4])

### (b) Calculate similarity between two tweets

In [28]:
def cosine(dict1,dict2):
    """Return the cosine similarity of two phrase dictionaries.

    Only the keys are used: the score is the number of keys the two
    dictionaries share divided by sqrt(len(dict1) * len(dict2)).

    Returns 0.0 when either dictionary is empty, since two tweets cannot be
    similar when one of them has no phrases (and the division would fail).
    """
    denominator = math.sqrt(len(dict1)*len(dict2))

    #an empty dictionary makes the denominator zero
    if denominator == 0:
        return 0.0

    #tally the phrases that appear in both dictionaries
    numerator = sum(1 for phrase in dict1 if phrase in dict2)

    return numerator/denominator

In [29]:
#CHECK
#cosine(moving_window(santa_text[1]),moving_window(santa_text[3]))

### (c) Read in Tweets and output near-duplicates

In [30]:
#cleaned tweets: runs of non-word characters that sit between whitespace become one space
santa_text_clean = [re.compile(r"\s[^\w]+\s").sub(" ",str(raw_tweet)) for raw_tweet in santa_text]

In [31]:
#For output header
print('List of Similar Tweets:\nNote: tweet number is NOT index\n')

#Build every tweet's phrase dictionary once up front, so nothing is rebuilt per pair
tweet_phrases = [moving_window(tweet) for tweet in santa_text_clean]

#Compare every tweet with all the tweets before it...
for i, dict_1 in enumerate(tweet_phrases):
    for j in range(i):

        cos=cosine(dict_1,tweet_phrases[j])

        #...and report the pairs that score above 0.5 (tweet numbers are index + 1)
        if cos> 0.5:
            print("Tweet",i+1,"and","Tweet",j+1,"are similar. Their similarty (cosine) score is ",cos,"\n")


List of Similar Tweets:
Note: tweet number is NOT index

Tweet 3 and Tweet 1 are similar. Their similarty (cosine) score is  0.8593378488473195 

Tweet 4 and Tweet 2 are similar. Their similarty (cosine) score is  0.628970902033151 

Tweet 5 and Tweet 2 are similar. Their similarty (cosine) score is  0.6837634587578276 

Tweet 5 and Tweet 4 are similar. Their similarty (cosine) score is  0.8362420100070908 



## 4. The Google of Quotes

In [32]:
#Open and read file as a list of lines.
#The with-block closes the file once the lines are read, instead of leaving the handle open.
with open("quotes.txt","r") as quote_file:
    quote_text = quote_file.readlines()

In [33]:
#NOTE: Assumption : Given quote and speaker - len will always be even
#len(quote_text)
#len([re.sub(r"\n","",str(element)) for element in quote_text])

### (a) Build a list of full quotes

In [34]:
#Remove the newline characters from every line
quote_text = [str(line).replace("\n","") for line in quote_text]

In [35]:
#Join each quote line with the speaker line that follows it: "quote - speaker"
#(a trailing line without a partner is left out)
paired_list = [" - ".join((quote, speaker)) for quote, speaker in zip(quote_text[0::2], quote_text[1::2])]
print(paired_list[0])

#TO FIX ASSUMPTION
#if len(quote_text) % 2 == 1:
    #paired_list.append(quote_text[len(quote_text)-1])

How we spend our days is, of course, how we spend our lives. - Annie Dillard


In [36]:
#Copies paired list to remove punctuations etc.
#quotes_list = [paired_list[i] for i in range(0, len(paired_list)-1)]
#quotes_list = [re.sub(r"[`\'\.,-\?%;!]","",str(element).lower()) for element in quotes_list]

#### Function that takes a full quote as argument and outputs a list of the words

In [37]:
def quote_words(quote):
    """Split a full quote into lower-case words with punctuation removed.

    quote: the quote text (converted with str() first).

    Returns the list of words, in order. Runs of whitespace are treated as a
    single separator, so no empty words are produced.
    """
    #Punctuation to delete. The hyphen is escaped on purpose: written as ",-\?"
    #it forms a character range from "," to "?" that also swallows every digit.
    #The other punctuation inside that range (/ : < = >) is listed explicitly
    #so it is still removed.
    quote = re.sub(r"[`'.,\-/:;<=>?%!]","",str(quote).lower())

    #split() without arguments collapses multiple whitespaces
    return quote.split()

In [38]:
#CHECK
#quote_words(paired_list[4])

#freq_dict = {key:value for key,value in zip(list(set(quote_words(paired_list[1]))),[0]*len(list(set(quote_words(paired_list[1])))))}
#print(freq_dict)

### (c) Build the postings-list dictionary

#### Function that will make dictionaries that will serve as the values in the posting-list dictionary

In [39]:
#Builds the per-quote dictionaries that are used as the values of the postings list
def Frequency(word_list):
    """Count how many times each word occurs in word_list.

    Returns a dictionary mapping every distinct word to its number of occurrences.
    """
    tally = {}

    #first sighting starts the count at 1, later sightings add 1
    for word in word_list:
        tally[word] = tally.get(word,0) + 1

    return tally

#### Create a list of dictionaries

In [40]:
#one word-frequency dictionary per full quote, in the same order as paired_list
dictionary_list = []
for full_quote in paired_list:
    dictionary_list.append(Frequency(quote_words(full_quote)))

In [41]:
#CHECK
#dictionary_list[0:2]

In [42]:
#Postings list: full quote -> word-frequency dictionary of that quote
postings_dict = dict(zip(paired_list,dictionary_list))

In [43]:
#CHECK
#postings_dict

### (d) Build the reverse postings-list dictionary

#### Function that formulates a list of all unique words in all quotes

In [44]:
def getUniqueWords(words):
    """Return the distinct words of an iterable, in first-seen order.

    words: iterable of words (hashable values such as strings).

    Returns a list with every word exactly once, ordered by first appearance.
    """
    #dict keys are unique and keep insertion order, so this removes repeats
    #in one pass instead of scanning the result list for every word
    return list(dict.fromkeys(words))

##### Now making list of every word in every quote to feed the getUniqueWords function

In [45]:
#every word of every quote, in quote order (repeats included)
all_words_list = [word for full_quote in paired_list for word in quote_words(full_quote)]

##### Now put those two together...

In [46]:
#every distinct word across all quotes, in the order each word is first seen
uniq_words_list = getUniqueWords(all_words_list)

In [47]:
#quotes_list_cleaned[0]

#### Now, build a function that counts the number of times a word appears in the quote

In [48]:
def countX(lst, x):
    """Return how many elements of lst are equal to x."""
    return sum(1 for candidate in lst if candidate == x)

#### Function that creates a frequency dictionary

In [49]:
def Frequency_rev(uniq_words_list,paired_list):
    """Build the reverse postings list: word -> {full quote: count}.

    uniq_words_list: the words to index.
    paired_list: the full quotes to look through.

    Returns a dictionary with one entry per word in uniq_words_list. Each
    value is its own dictionary mapping every quote that contains the word to
    the number of times the word occurs in it (as split by quote_words),
    with quotes in paired_list order. A word found in no quote maps to an
    empty dictionary, which is also the result when paired_list is empty.
    """
    #start every word with its own empty dictionary of quotes
    freq_dict = {word: {} for word in uniq_words_list}

    #dict.fromkeys skips repeated quotes while keeping first-seen order
    for full_quote in dict.fromkeys(paired_list):

        #split each quote once and credit each of its words as it goes by
        for word in quote_words(full_quote):
            quotes_with_word = freq_dict.get(word)

            #words outside uniq_words_list are not indexed
            if quotes_with_word is not None:
                quotes_with_word[full_quote] = quotes_with_word.get(full_quote,0) + 1

    #return dictionary
    return freq_dict

In [50]:
#CHECK
#Frequency_rev(uniq_words_list[0:200],paired_list[1:50])    

In [51]:
#postings_dict

In [52]:
#reverse postings list over all quotes: word -> {full quote: times the word occurs in it}
full_rev_dict = Frequency_rev(uniq_words_list,paired_list)

### (e) Write a TF-IDF function

In [53]:
#sample word and sample full quote ("quote - speaker" form) for trying out the TF-IDF function
word_='diplomat'
quote_ = 'A diplomat is a man who always remembers a woman`s birthday but never remembers her age. - Robert Frost'

In [54]:
def tfidfFunc(word,quote):
    """Return the TF-IDF score of a word within one full quote.

    TF  = occurrences of word in the quote / occurrences of the quote's most frequent word
    IDF = natural log of (number of quotes / number of quotes containing word)

    Reads the module-level postings_dict, full_rev_dict and paired_list.
    """
    #term frequency, scaled by the quote's most frequent word
    word_counts = postings_dict[quote]
    term_frequency = word_counts[word]/max(word_counts.values())

    #inverse document frequency over the whole collection of quotes
    quotes_with_word = len(full_rev_dict[word])
    inverse_document_frequency = math.log(len(paired_list)/quotes_with_word)

    return term_frequency*inverse_document_frequency

In [55]:
#TF-IDF score of the sample word within the sample quote
tfidfFunc(word_,quote_)

2.2622389835350267

### (f) Quote search using a single word

##### Function that takes a word as an argument and returns a dictionary whose keys are the full quotes containing that word, and whose values are the TF-IDF score of that word for each full quote.

In [56]:
def quote_search(word_input):
    """Return the TF-IDF score of a word for every quote that contains it.

    word_input: the word to look up in full_rev_dict.

    Returns a new dictionary mapping each full quote containing the word to
    the word's TF-IDF score in that quote. A word that is not in
    full_rev_dict gives an empty dictionary.
    """
    #Build a fresh dictionary: writing the scores into full_rev_dict[word_input]
    #would replace the stored occurrence counts with floats.
    matching_quotes = full_rev_dict.get(word_input, {})
    return {full_quote: tfidfFunc(word=word_input,quote=full_quote) for full_quote in matching_quotes}

In [57]:
#every quote containing the sample word, with the word's TF-IDF score in each
quote_search(word_)

{'A diplomat is a man who always remembers a woman`s birthday but never remembers her age. - Robert Frost': 2.2622389835350267}

### (g) Quote Search using multiple words

In [58]:
# Add two dicts
def addDicts(dict1,dict2):
    """Merge two dictionaries, adding the values of keys they share.

    Returns a new dictionary holding every key of dict1 followed by the keys
    found only in dict2. A key present in both gets dict1's value plus
    dict2's value. Neither input is modified.
    """
    d3 = dict(dict1)

    #one pass over dict2: add to an existing entry, otherwise create it
    for key, value in dict2.items():
        d3[key] = d3[key] + value if key in d3 else value

    return d3


In [59]:
def quote_search_multiple(word_list):
    """Search the quotes for several words at once.

    Runs quote_search for every word in word_list and merges the results with
    addDicts, so a quote matching several words gets the sum of their scores.

    Returns a dictionary mapping each matching full quote to its combined score.
    """
    #look up every word first, then fold the per-word results together
    per_word_scores = [quote_search(word) for word in word_list]

    combined_scores = {}
    for word_scores in per_word_scores:
        combined_scores = addDicts(combined_scores,word_scores)

    return combined_scores

In [60]:
#sample multi-word query
word_list_ = ['george','was','always','a','diplomat']


In [61]:
#combined scores of the sample query for every quote matching at least one of its words
quote_search_multiple(word_list=word_list_)

{'It is necessary for the welfare of society that genius should be privileged to utter sedition, to blaspheme, to outrage good taste, to corrupt the youthful mind, and generally to scandalize one`s uncles. - George Bernard Shaw': 0.624631060895087,
 'Success does not consist in never making mistakes but in never making the same one a second time. - George Bernard Shaw': 2.0676603547680537,
 'All glory is fleeting. - George Patton': 3.1231553044754348,
 'Associate yourself with men of good quality if you esteem your own reputation; for `tis better to be alone than in bad company. - George Washington': 3.1231553044754348,
 'Beauty is all very well at first sight; but whoever looks at it when it has been in the house three days? - George Bernard Shaw': 1.5615776522377174,
 'Fanaticism consists of redoubling your efforts when you have forgotten your aim. - George Santayana': 1.5615776522377174,
 'Government is not reason; it is not eloquent; it is force. Like fire, it is a dangerous servan