## WIM Python Textmining workshop: 2020-10-22
### Helge Marahrens (hmarahre@iu.edu)
### Part 1: Basic String Manipulation

In [1]:
# // import packages
from nltk import word_tokenize
from nltk.corpus import stopwords
from fuzzywuzzy import fuzz
from collections import Counter, defaultdict
from itertools import combinations 

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt



In [2]:
# string basics
string_1 = 'Hello, '         #   create a new string
string_2 = 'world.'

In [3]:
# lists are ordered sequences of items e.g., strings.
list_1 = [string_1, string_2]

In [4]:
# lists can be accessed via indexing and slicing
print(list_1[0])
list_1.append("How are you?")
print(list_1[1:])

Hello, 
['world.', 'How are you?']


In [5]:
for i, string in enumerate(list_1):
    print(str(i) + ": " + string)

0: Hello, 
1: world.
2: How are you?


In [6]:
# strings are ordered sequences of letters
for i, letter in enumerate(string_1):
    print(str(i) + ": " + letter)

0: H
1: e
2: l
3: l
4: o
5: ,
6:  


In [7]:
# just like lists, strings are of a particular length 
print(len(string_1))

7


In [8]:
# case sensitive
print(string_1.lower())
print(string_1.lower() == string_1)
print(string_1.strip(" "))
print(string_1.strip(" ") == string_1)
print(string_2.capitalize())
print(string_2.upper())

hello, 
False
Hello,
False
World.
WORLD.


In [9]:
# Counter objects count how often a particular
# item appears in a sequence
print(Counter(string_1))

Counter({'l': 2, 'H': 1, 'e': 1, 'o': 1, ',': 1, ' ': 1})


In [10]:
# concatenate strings
print(string_1 + string_2)

Hello, world.


In [11]:
# fuzzy string matching
# films to compare
films = ['eternal sunshine',
 'eternal sunshine of the spotless mind',
 'eternal sunshine of a spotless mind',
 'eternalsunshine of the spotless mind',
 'the eternal sunshine of the spotless mind',
 'eternal sunshine on the spotless mind',
 'o brother where art thou',
 'o brother , where art thou?',
 'oh brother where art thou']

# create comparison dataframe
comp_df = pd.DataFrame("", index=films, columns=films)
comp_df.head()

Unnamed: 0,eternal sunshine,eternal sunshine of the spotless mind,eternal sunshine of a spotless mind,eternalsunshine of the spotless mind,the eternal sunshine of the spotless mind,eternal sunshine on the spotless mind,o brother where art thou,"o brother , where art thou?",oh brother where art thou
eternal sunshine,,,,,,,,,
eternal sunshine of the spotless mind,,,,,,,,,
eternal sunshine of a spotless mind,,,,,,,,,
eternalsunshine of the spotless mind,,,,,,,,,
the eternal sunshine of the spotless mind,,,,,,,,,


In [12]:
# exact-match similarity: 1 when two titles are identical strings, else 0
comb = list(combinations(films, 2))
for first_title, second_title in comb:
    # row = second title of the pair, column = first title of the pair
    comp_df.loc[second_title, first_title] = int(first_title == second_title)

comp_df.head()

Unnamed: 0,eternal sunshine,eternal sunshine of the spotless mind,eternal sunshine of a spotless mind,eternalsunshine of the spotless mind,the eternal sunshine of the spotless mind,eternal sunshine on the spotless mind,o brother where art thou,"o brother , where art thou?",oh brother where art thou
eternal sunshine,,,,,,,,,
eternal sunshine of the spotless mind,0.0,,,,,,,,
eternal sunshine of a spotless mind,0.0,0.0,,,,,,,
eternalsunshine of the spotless mind,0.0,0.0,0.0,,,,,,
the eternal sunshine of the spotless mind,0.0,0.0,0.0,0.0,,,,,


In [13]:
# fuzzy string matching
fuzz.ratio("this is a test", "this is a test!")

# find the fuzzy similarity (0-100) between films
for comparison in combinations(films, 2):
    similarity = fuzz.ratio(comparison[0], comparison[1])
    comp_df.loc[comparison[1], comparison[0]] = similarity
    
comp_df

Unnamed: 0,eternal sunshine,eternal sunshine of the spotless mind,eternal sunshine of a spotless mind,eternalsunshine of the spotless mind,the eternal sunshine of the spotless mind,eternal sunshine on the spotless mind,o brother where art thou,"o brother , where art thou?",oh brother where art thou
eternal sunshine,,,,,,,,,
eternal sunshine of the spotless mind,60.0,,,,,,,,
eternal sunshine of a spotless mind,63.0,94.0,,,,,,,
eternalsunshine of the spotless mind,58.0,99.0,93.0,,,,,,
the eternal sunshine of the spotless mind,56.0,95.0,89.0,94.0,,,,,
eternal sunshine on the spotless mind,60.0,97.0,92.0,96.0,92.0,,,,
o brother where art thou,30.0,36.0,27.0,33.0,37.0,36.0,,,
"o brother , where art thou?",28.0,34.0,26.0,32.0,35.0,34.0,94.0,,
oh brother where art thou,29.0,35.0,27.0,33.0,36.0,35.0,98.0,92.0,


### Part 2: Regular Expressions

In [14]:
# // import packages
import re
from collections import Counter
from nltk.tokenize import sent_tokenize, word_tokenize
import pandas as pd

In [15]:
# tokenize
word_tokenize("How are you?")

['How', 'are', 'you', '?']

In [16]:
# regular expression to match the ending of sentences
sentence_endings = r"[.?!]"
sent_1 = "Voting for the 91st Academy Awards ends on Tuesday."
sent_2 = "Roughly 8,200 movie industry insiders use a private website to mark their choices."
sent_3 = "Just what goes through their minds when they point and click?"

sent_con = sent_1 + " " + sent_2 + " " + sent_3
print(re.split(sentence_endings, sent_con))

['Voting for the 91st Academy Awards ends on Tuesday', ' Roughly 8,200 movie industry insiders use a private website to mark their choices', ' Just what goes through their minds when they point and click', '']


In [17]:
# regular expression to match capitalized words
capitalized_words = r"[A-Z]\w+"
print(re.findall(capitalized_words, sent_con))

['Voting', 'Academy', 'Awards', 'Tuesday', 'Roughly', 'Just']


In [18]:
# regular expression to split strings on whitespace
spaces = r"[\s]+"
split_tokens = re.split(spaces, sent_con)
print(split_tokens)
# keep only non-empty pieces (re.split yields '' when the text starts or ends
# with whitespace). A plain truthiness test is used here: the earlier
# `token not in ""` was a substring check that only filtered '' by accident.
my_tokens = [token.lower() for token in split_tokens if token]

['Voting', 'for', 'the', '91st', 'Academy', 'Awards', 'ends', 'on', 'Tuesday.', 'Roughly', '8,200', 'movie', 'industry', 'insiders', 'use', 'a', 'private', 'website', 'to', 'mark', 'their', 'choices.', 'Just', 'what', 'goes', 'through', 'their', 'minds', 'when', 'they', 'point', 'and', 'click?']


In [19]:
# regular expressions to find digits in a string
digits = r"[\d]+"
print(re.findall(digits, sent_con))

digits = r"[\d]+[,]*[\d]*"
print(re.findall(digits, sent_con))

['91', '8', '200']
['91', '8,200']


In [20]:
# tokenization
print(my_tokens)
tokens = word_tokenize(sent_con)
unique_tokens = set(tokens)
print(unique_tokens)
print(Counter(tokens))
print(Counter(tokens).most_common(3))
sentences = sent_tokenize(sent_con)

['voting', 'for', 'the', '91st', 'academy', 'awards', 'ends', 'on', 'tuesday.', 'roughly', '8,200', 'movie', 'industry', 'insiders', 'use', 'a', 'private', 'website', 'to', 'mark', 'their', 'choices.', 'just', 'what', 'goes', 'through', 'their', 'minds', 'when', 'they', 'point', 'and', 'click?']
{'the', '91st', 'Tuesday', 'Academy', '?', 'point', 'their', 'website', 'when', 'and', 'industry', 'Roughly', 'movie', 'mark', 'through', 'Awards', 'private', 'minds', 'Voting', 'they', 'on', 'a', 'for', 'to', '.', 'choices', 'use', 'click', '8,200', 'Just', 'ends', 'what', 'insiders', 'goes'}
Counter({'.': 2, 'their': 2, 'Voting': 1, 'for': 1, 'the': 1, '91st': 1, 'Academy': 1, 'Awards': 1, 'ends': 1, 'on': 1, 'Tuesday': 1, 'Roughly': 1, '8,200': 1, 'movie': 1, 'industry': 1, 'insiders': 1, 'use': 1, 'a': 1, 'private': 1, 'website': 1, 'to': 1, 'mark': 1, 'choices': 1, 'Just': 1, 'what': 1, 'goes': 1, 'through': 1, 'minds': 1, 'when': 1, 'they': 1, 'point': 1, 'and': 1, 'click': 1, '?': 1})
[('.', 2), ('their', 2), ('Voting', 1)]

In [21]:
# find working directory
import os
os.getcwd()

'C:\\Users\\H_Mar'

In [22]:
# cities dataset
path = "" # your working directory
          # e.g. C:/Users/username/Desktop/2020-02-14/Text Mining
          # important: use forward slashes: /
#os.chdir(path)
city_df = pd.read_csv('cities.csv')
city_df.head()

Unnamed: 0,city,latlon
0,"Fairless Hills (USA, PA)",('40.17888889000000318674210575409233570098876...
1,Blacktown (Australia),('-33.7711111099999996554288372863084077835083...
2,"Winter Park (USA, FL)",('28.59611110999999894488610152620822191238403...
3,Ajman (United Arab Emirates),('25.41361111000000150283995026256889104843139...
4,Medicine Hat (Canada),('50.04166666999999790732545079663395881652832...


In [23]:
# regular expressions to match inside parentheses
# matcher_USA: a literal "(USA," followed by optional whitespace; group 1
#   greedily captures what follows, up to the last ")" on the line
#   (e.g. the "PA" in "Fairless Hills (USA, PA)")
matcher_USA = re.compile(r'\(USA,\s*(.*)\)')
# matcher_other: any parenthesised text; group 1 is the content between the
#   opening "(" and the last ")" on the line (greedy match)
matcher_other = re.compile(r'\((.*)\)')

In [24]:
# test city
test_cities = city_df.iloc[0:2,0].tolist()
test_cities
print(test_cities)
matcher_USA.search(test_cities[0])

['Fairless Hills (USA, PA)', 'Blacktown (Australia)']


<_sre.SRE_Match object; span=(15, 24), match='(USA, PA)'>

In [25]:
print(matcher_USA.search(test_cities[1]))

None


In [26]:
print(matcher_other.search(test_cities[1]))
print(matcher_other.search(test_cities[1]).group(0))

<_sre.SRE_Match object; span=(10, 21), match='(Australia)'>
(Australia)


In [27]:
# define function to pull country from city name
    # country is in parentheses
def find_country(city_name):
    """Return the country named in the parentheses of ``city_name``.

    US cities are written like "Fairless Hills (USA, PA)" and always yield
    'USA'; any other city is written like "Blacktown (Australia)" and yields
    the text between the parentheses.

    Parameters
    ----------
    city_name : str
        City label with the country in parentheses.

    Returns
    -------
    str or None
        The country name, or None when ``city_name`` contains no
        parenthesised part (this case used to raise AttributeError).
    """
    # US entries carry an extra state code, so they are checked first
    if matcher_USA.search(city_name) is not None:
        return 'USA'
    match = matcher_other.search(city_name)
    if match is None:
        return None
    # group(1) is the text inside the parentheses, without the brackets
    return match.group(1)

In [28]:
# apply function to city column
city_df['country'] = city_df['city'].apply(find_country)
city_df.loc[:,['city','country']].head()

Unnamed: 0,city,country
0,"Fairless Hills (USA, PA)",USA
1,Blacktown (Australia),Australia
2,"Winter Park (USA, FL)",USA
3,Ajman (United Arab Emirates),United Arab Emirates
4,Medicine Hat (Canada),Canada


In [29]:
# what is the frequency of each country in the sample?
city_df['country'].value_counts().head(10)

USA               60
Germany           16
France            14
Japan             13
United Kingdom     9
Canada             6
Netherlands        6
Switzerland        4
Australia          4
Ireland            3
Name: country, dtype: int64

### Part 3: Sentiment Analysis

In [30]:
# // import packages
import pandas as pd
from glob import glob
import re
from nltk import word_tokenize
from collections import defaultdict
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF

In [31]:
# collect the paths of all files in the "films" folder (relative to the
# working directory) whose names end with ".txt"
extension = "txt"
# glob returns matches in arbitrary, OS-dependent order; sorting keeps the
# row numbering of the film table stable across machines and re-runs
film_files = sorted(glob("films/" + '*.{}'.format(extension)))

In [32]:
# read every film file and store four fields per film, keyed by the file's
# position in film_files: the path plus lines 1, 5 and 9 of the file
# (labelled genre, genre_full and plot when the dataframe is built)
film_dict = defaultdict(list)
for idx, film_path in enumerate(film_files):
    with open(film_path, 'r') as handle:
        lines = handle.readlines()
    film_dict[idx] = [film_path] + [lines[n].strip("\n") for n in (0, 4, 8)]
film_dict[0]

['films\\10 Things I Hate About You.txt',
 'Comedy',
 'Comedy, Drama, Romance',
 "A pretty, popular teenager can't go out on a date until her ill-tempered older sister does."]

In [33]:
# create dataframe
plot_df = pd.DataFrame.from_dict(film_dict,
                    orient='index',
                    columns = ['file', 'genre', 'genre_full', 'plot'])
plot_df.head()

Unnamed: 0,file,genre,genre_full,plot
0,films\10 Things I Hate About You.txt,Comedy,"Comedy, Drama, Romance","A pretty, popular teenager can't go out on a d..."
1,films\24 Hour Party People.txt,Comedy,"Biography, Comedy, Drama","In 1976, Tony Wilson sets up Factory Records a..."
2,films\300.txt,Action,"Action, Fantasy",King Leonidas of Sparta and a force of 300 men...
3,films\50 First Dates.txt,Comedy,"Comedy, Drama, Romance",Henry Roth is a man afraid of commitment up un...
4,films\500 Days of Summer.txt,Comedy,"Comedy, Drama, Romance",An offbeat romantic comedy about a woman who d...


In [34]:
# sentiment analysis

# we will use the AFINN word list (stored here as AFIN.csv)
# Nielsen 2011 - A new ANEW. Evaluation of a word list for
#                sentiment analysis in microblogs
# os.path.join adds the separator that plain concatenation misses when `path`
# has no trailing slash; with path = "" the result is simply "AFIN.csv"
AFIN = pd.read_csv(os.path.join(path, "AFIN.csv"), index_col=0)
print(AFIN.head(10))

            word  score
1   breathtaking      5
2         hurrah      5
3    outstanding      5
4         superb      5
5       thrilled      5
6        amazing      4
7        awesome      4
8      brilliant      4
9       ecstatic      4
10      euphoric      4


In [35]:
# the sentiment analysis
def sentiment(text):
    """Return the mean AFIN score of the sentiment words found in ``text``.

    The text is lower-cased and tokenized with ``word_tokenize``; every token
    listed in the ``word`` column of the global ``AFIN`` frame contributes its
    ``score``. A token that occurs several times counts each time.

    Parameters
    ----------
    text : str
        Raw text to score.

    Returns
    -------
    number
        Sum of the matched scores divided by the number of matched tokens,
        or 0 when no token appears in ``AFIN``.
    """
    # Build the word -> score lookup once per call instead of converting the
    # column to a list and filtering the frame for every single token.
    # setdefault keeps the first score when a word is listed more than once,
    # matching the earlier `.values[0]` lookup.
    lexicon = {}
    for word, score in zip(AFIN['word'], AFIN['score']):
        lexicon.setdefault(word, score)

    scores = [lexicon[token] for token in word_tokenize(text.lower())
              if token in lexicon]
    if not scores:
        return 0
    return sum(scores) / len(scores)

In [36]:
# positive sentiment
sentiment("Awesome! I am thrilled that it's Friday.")

4.5

In [37]:
# find sentiment in plots
plot_df['polarity'] = plot_df['plot'].apply(sentiment)


# summarize sentiment by genre
plot_df.groupby('genre').polarity.describe().round(2)

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
genre,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Action,144.0,-0.4,1.57,-3.0,-1.62,0.0,0.5,4.0
Comedy,534.0,0.03,1.47,-3.0,-1.0,0.0,1.0,3.0


In [38]:
# find the most positive plot
plot_df.sort_values('polarity', ascending=False, inplace=True)
plot_df.head(10)

Unnamed: 0,file,genre,genre_full,plot,polarity
391,films\Raiders of the Lost Ark.txt,Action,"Action, Adventure","In 1936, archaeologist and adventurer Indiana ...",4.0
372,films\Pirate Radio.txt,Comedy,"Comedy, Drama, Music","A band of rogue DJs that captivated Britain, p...",3.0
639,films\Vuxna människor.txt,Comedy,Comedy,Frank leads a respectable yuppie life working ...,3.0
560,films\The Match.txt,Comedy,"Comedy, Romance, Sport",Two Scottish pub football teams play each othe...,3.0
365,"films\Paris, je t'aime.txt",Comedy,"Comedy, Drama, Romance","Through the neighborhoods of Paris, love is ve...",3.0
635,films\Up.txt,Comedy,"Animation, Adventure, Comedy",Seventy-eight year old Carl Fredricksen travel...,3.0
165,films\Dutch.txt,Comedy,"Comedy, Drama","To get to know his girlfriend's son, a working...",3.0
574,films\The Quiet Man.txt,Comedy,"Comedy, Drama, Romance",A retired American boxer returns to the villag...,3.0
256,"films\I Love You, Man.txt",Comedy,"Comedy, Romance",Friendless Peter Klaven goes on a series of ma...,3.0
470,films\Superman.txt,Action,"Action, Adventure, Drama",An alien orphan is sent from his dying planet ...,3.0


In [39]:
plot_df.iloc[0,3]

"In 1936, archaeologist and adventurer Indiana Jones is hired by the U.S. government to find the Ark of the Covenant before Adolf Hitler's Nazis can obtain its awesome powers."

### Part 4: Dimension Reduction (Non-Negative Matrix Factorization)

In [40]:
# dimension reduction 

# create a function to clean text
# part of cleaning the text is deleting stopwords
sw_english = stopwords.words('english')
sw_english[0:10]

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]

In [41]:
# another part is stemming words
ps = PorterStemmer()

In [42]:
# create clean_text function
def clean_text(text):
    """Turn ``text`` into a list of stemmed tokens without stopwords.

    Steps, in order: lower-case the text, delete the punctuation characters
    ``, . : ; ? ! &``, tokenize with ``word_tokenize``, drop every token
    listed in ``sw_english`` and stem the remaining tokens with ``ps``.
    """
    lowered = text.lower()
    without_punct = re.sub(r'[,.:;?!&]', '', lowered)
    stems = []
    for token in word_tokenize(without_punct):
        if token in sw_english:
            continue
        stems.append(ps.stem(token))
    return stems

In [43]:
test_string = plot_df.iloc[0,3]
clean_text(test_string)

['1936',
 'archaeologist',
 'adventur',
 'indiana',
 'jone',
 'hire',
 'us',
 'govern',
 'find',
 'ark',
 'coven',
 'adolf',
 'hitler',
 "'s",
 'nazi',
 'obtain',
 'awesom',
 'power']

In [44]:
# apply function to plot column
plot_df['clean_text'] = plot_df['plot'].\
                        apply(lambda x: " ".join(clean_text(x)))

# term frequency - inverse document frequency
# ignore terms that occur in more than 90% or fewer than 1% of the plots and
# cap the vocabulary at 1000 terms
tfidf_vectorizer = TfidfVectorizer(max_df=0.9,
                                   min_df=0.01,
                                   max_features=1000)
tfidf_matrix = tfidf_vectorizer.fit_transform(plot_df['clean_text'])
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations
try:
    words = list(tfidf_vectorizer.get_feature_names_out())
except AttributeError:
    words = list(tfidf_vectorizer.get_feature_names())

# set the number of components
num_clusters = 5
# fixed seed so that repeated runs give the same factorization
NMF_model = NMF(n_components = num_clusters, random_state = 0)
NMF_model.fit(tfidf_matrix)
# rows = terms, columns = components
components_df = pd.DataFrame(NMF_model.components_,
                             columns=words).\
                             transpose()

# see terms associated with first component
print(components_df.sort_values(0, ascending=False))

               0         1         2         3         4
young   1.387347  0.000000  0.000000  0.000000  0.000000
man     1.052714  0.000000  0.000000  0.000000  0.000000
find    0.819016  0.000000  0.001052  0.010774  0.131421
woman   0.621514  0.013349  0.000918  0.157279  0.000000
help    0.436571  0.001183  0.000000  0.000000  0.081170
...          ...       ...       ...       ...       ...
howev   0.000000  0.000000  0.000000  0.041000  0.052952
togeth  0.000000  0.030567  0.028754  0.017022  0.053647
town    0.000000  0.000000  0.000000  0.000000  0.389313
high    0.000000  0.000000  0.767298  0.000000  0.000000
group   0.000000  0.000000  0.013122  0.037249  0.241250

[284 rows x 5 columns]


In [45]:
# see terms associated with clusters
for column in components_df.columns:
    print(components_df.sort_values(column, ascending=False).index.tolist()[:5])

['young', 'man', 'find', 'woman', 'help']
['life', 'famili', 'live', 'film', 'stori']
['school', 'high', 'friend', 'student', 'year']
['love', 'fall', 'new', 'york', 'citi']
['two', 'tri', 'town', 'team', 'small']
