# The goal of this project was to use Latent Semantic Analysis to determine the dominant words and concepts associated with a dataset of described UFO sightings and create topic clusters.

### Data Prep

In [1]:
###Importing Packages
import pandas as pd
import numpy as np 
import os

In [2]:
###Setting Working Directory
import os
path="/Users/benroberts/Downloads/MSA-Fall1"
os.chdir(path)
os.getcwd()

'/Users/benroberts/Downloads/MSA-Fall1'

In [3]:
###Loading Dataset as Dataframe
ufo = pd.read_csv("scrubbed.csv")

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
### Subsetting Comments
ufo_com=ufo['comments']

In [143]:
###Creating list of strings for Processing
ufo_l=list(ufo_com)
type(ufo_l)
for i in range(len(ufo_l)):
    ufo_l[i]=str(ufo_l[i])
#ufo_l

In [10]:
###Importing Text Packages
import nltk
import re
import string

In [11]:
### Remove punctuation, capitals, then tokenize documents

punc = re.compile( '[%s]' % re.escape( string.punctuation ) )
term_vec = [ ]

for d in ufo_l:
    d = d.lower()
    d = punc.sub( '', d )
    term_vec.append( nltk.word_tokenize( d ) )

In [12]:
### Remove stop words from term vectors

stop_words = nltk.corpus.stopwords.words( 'english' )

for i in range( 0, len( term_vec ) ):
    term_list = [ ]

    for term in term_vec[ i ]:
        if term not in stop_words:
            term_list.append( term )

    term_vec[ i ] = term_list

In [13]:
### Porter stem remaining terms

porter = nltk.stem.porter.PorterStemmer()

for i in range( 0, len( term_vec ) ):
    for j in range( 0, len( term_vec[ i ] ) ):
        term_vec[ i ][ j ] = porter.stem( term_vec[ i ][ j ] )

In [14]:
###Detokenize term vector to Return New List of Comments

detokenized_doc = []
for i in range(len(term_vec)):
    t = ' '.join(term_vec[i])
    detokenized_doc.append(t)

ufo_new = detokenized_doc

In [15]:
### TF-IDF vectorize documents w/sklearn, remove English stop words

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import stop_words

vect = TfidfVectorizer( stop_words='english', max_features=10000 )
tfidf_matrix = vect.fit_transform( ufo_new )
tf_idf_matrix = tfidf_matrix.todense()

In [16]:
### Create list of remaining unique terms (10,000)
terms = vect.get_feature_names()

In [17]:
###TFIDF DataFrame
tfidf=pd.DataFrame(tf_idf_matrix, columns=terms)

### Latent Semantic Analysis 

In [29]:
###Latent Semantic Analysis with SVD on TF-IDF Matrix (15 concepts)

from sklearn.decomposition import TruncatedSVD

# SVD represent documents and terms in vectors 
svd_model = TruncatedSVD(n_components=10, algorithm='randomized', n_iter=500, random_state=42)

document_topics=svd_model.fit_transform(tfidf_matrix)

In [247]:
###Printing Out Topics/Concepts and their most Heavily Weighted Terms
term=[]
weight=[]
yeah=abs(svd_model.components_)
for i, comp in enumerate(svd_model.components_):
    terms_comp = zip(terms, comp)
    sorted_terms = sorted(terms_comp, key= lambda x:x[1], reverse=True)[:10]
    for j in range(len(sorted_terms)):
        temp.append(sorted_terms[j][0])
    print("Topic "+str(i)+": ")

    for t in sorted_terms:
        print(t[1], t[0])
        term.append(t[0])
        weight.append(t[1])
        print(" ")

Topic 0: 
0.5385734656359412 light
 
0.3617031187922644 sky
 
0.3159090815223304 bright
 
0.25177545629379994 object
 
0.20269082186896123 orang
 
0.17267533619121195 red
 
0.17001387110671765 white
 
0.15629877953607885 shape
 
0.11044498769831661 flash
 
0.10807283075836578 fli
 
Topic 1: 
0.6076086111310213 object
 
0.35157414932409503 shape
 
0.18570067896005463 fli
 
0.12140584651786682 ufo
 
0.11306173406386788 craft
 
0.09936046875492628 like
 
0.08868659931660017 seen
 
0.08804597887504013 triangl
 
0.0819008302147879 triangular
 
0.08148463192547972 saw
 
Topic 2: 
0.6575063468490188 sky
 
0.3832803534638034 orang
 
0.19082793594180836 firebal
 
0.17800082334585365 night
 
0.090203776630442 glow
 
0.08951666773271823 ball
 
0.07980550447369046 orb
 
0.04691119850918314 object
 
0.043691598353301934 sphere
 
0.03075574630383237 strang
 
Topic 3: 
0.6891570751014322 orang
 
0.1526174809404168 ball
 
0.150473004785703 glow
 
0.14727023210372028 craft
 
0.14269762330943025 orb
 
0

In [176]:
###Dictionary of Terms and Weights for Topic 0
dicts = {}
for t in range(len(term)):
    dicts[term[t]] = weight[t]
print(dicts)

{'light': 0.5385734656359412, 'sky': 0.18503242971605582, 'bright': 0.18404182586423373, 'object': 0.13481028604443868, 'orang': 0.22600593734740937, 'red': 0.4744154397483481, 'white': 0.17195824814305613, 'shape': 0.2159812843755849, 'flash': 0.10238830651130339, 'fli': 0.0652358127940796, 'ufo': 0.45173969589627094, 'craft': 0.163988035496486, 'like': 0.36219687173586856, 'seen': 0.0940811824313973, 'triangl': 0.10855708164395664, 'triangular': 0.0324056520151527, 'saw': 0.28746447938101066, 'firebal': 0.34370831679221986, 'night': 0.06529708501590695, 'glow': 0.10987057172824335, 'ball': 0.1526174809404168, 'orb': 0.11619473084812662, 'sphere': 0.07456181613951442, 'strang': 0.03075574630383237, 'format': 0.11597383156385913, 'fast': 0.063979813658856, 'disappear': 0.06080654604680967, 'travel': 0.09526623081211516, 'speed': 0.04461121584398084, 'round': 0.03955264519978765, 'sight': 0.27834297124471286, 'note': 0.2758411540646078, 'nuforc': 0.27289009446565715, 'pd': 0.27141237985

In [179]:
###WordCloud of Highest Weighted Words for Topic 0
import matplotlib
import matplotlib.pyplot as plt
from wordcloud import WordCloud

wc = WordCloud(
    background_color="white",
    
    max_words=2000,
    width = 1024,
    height = 720,
    colormap=matplotlib.cm.inferno
)

# Generate the cloud

wc.generate_from_frequencies(dicts)

# Save the could to a file

wc.to_file("topic_all.png")

<wordcloud.wordcloud.WordCloud at 0x1a1cfb8c18>

In [32]:
###Term-Concept Matrix
pd.set_option('display.max_rows', 5000)
pd.set_option('display.max_columns', 5000)
concept_term_mat=pd.DataFrame(np.round(svd_model.components_,4),index = [['T'+str(i) for i in range(1,11)]],columns =
terms)

In [33]:
###Document-Concept Matrix
doc_concept=pd.DataFrame(np.round(document_topics,3), columns=['T'+str(i) for i in range(1,11)])
doc_concept.head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
0,0.01,0.005,0.004,-0.006,-0.002,0.006,-0.001,0.005,0.012,0.002
1,0.109,-0.035,0.046,-0.035,-0.04,-0.037,0.01,-0.005,0.002,-0.024
2,0.018,0.018,-0.008,0.004,0.003,-0.002,-0.002,0.003,-0.008,0.0
3,0.013,0.01,-0.002,-0.002,-0.003,0.005,-0.001,0.005,0.015,-0.007
4,0.048,0.043,0.014,0.008,-0.025,-0.014,-0.001,0.005,0.026,-0.073


In [226]:
#Sorting Document-Concept Matrix by highest weighted terms in each topic
doc_concept.sort_values(by='T5',ascending=False).head()

Unnamed: 0,T1,T2,T3,T4,T5,T6,T7,T8,T9,T10
79040,0.403,0.233,-0.058,-0.263,0.627,0.217,0.257,0.029,-0.237,0.07
13065,0.403,0.233,-0.058,-0.263,0.627,0.217,0.257,0.029,-0.237,0.07
29962,0.403,0.233,-0.058,-0.263,0.627,0.217,0.257,0.029,-0.237,0.07
61467,0.437,0.126,0.21,0.259,0.572,0.312,0.131,-0.036,-0.263,0.064
33473,0.437,0.126,0.21,0.259,0.572,0.312,0.131,-0.036,-0.263,0.064


In [242]:
###TOP 5 COMMENTS FOR EACH CONCEPT
columns=list(doc_concept)
x=[doc_concept[column].nlargest(5).index.values for column in doc_concept]
for i in range(0,10):
    for j in range(0,5):
        print('Topic'+ ' ' + str(i+1)+ ' ' +ufo_l[x[i][j]])
###For Each Topic, Printing Comments with Highest Scores

Topic 1 Bright light /Object moving Across sky
Topic 1 Disk-like&#44 very bright object in the sky with lights
Topic 1 object in sky with bright light
Topic 1 Bright light moving in the sky
Topic 1 bright lights in the sky
Topic 2 moving light/box shape object
Topic 2 X shaped object.
Topic 2 Very LARGE  V shaped objects or objects
Topic 2 Star/Planet Shaped Flying Object
Topic 2 us and  the object
Topic 3 A &quot;break-up&quot; in the sky
Topic 3 Fire in the sky.
Topic 3 FIRE IN THE SKY
Topic 3 Fire in the sky
Topic 3 fire in the sky
Topic 4 2 orange fireball&#39s over Bakersfield&#44CA
Topic 4 Orange lights
Topic 4 Orange lights
Topic 4 Orange lights.
Topic 4 orange lights
Topic 5 The object  was bright and stauled.
Topic 5 VERY bright object.
Topic 5 Bright object.
Topic 5 Bright Orange Object
Topic 5 Bright orange object
Topic 6 Spanaway UFO.  ((NUFORC Note:  Possible sighting of a &quot;twinkling&quot; star?  PD))
Topic 6 A bright show of A UFO>
Topic 6 Bright orange lights hoveri

In [292]:
###Determining Highest Scored Topic for Each Document
topic_cluster=doc_concept.idxmax(axis=1)

In [293]:
###Appending Dominant Topic to Each Document in DataFrame
ufo['topic']=topic_cluster

In [47]:
###Viewing Dataset
ufo.head()

Unnamed: 0,datetime,city,state,country,shape,duration (seconds),duration (hours/min),comments,date posted,latitude,longitude,topic
0,10/10/1949 20:30,san marcos,tx,us,cylinder,2700,45 minutes,This event took place in early fall around 194...,4/27/2004,29.8830556,-97.941111,T9
1,10/10/1949 21:00,lackland afb,tx,,light,7200,1-2 hrs,1949 Lackland AFB&#44 TX. Lights racing acros...,12/16/2005,29.38421,-98.581082,T1
2,10/10/1955 17:00,chester (uk/england),,gb,circle,20,20 seconds,Green/Orange circular disc over Chester&#44 En...,1/21/2008,53.2,-2.916667,T1
3,10/10/1956 21:00,edna,tx,us,circle,20,1/2 hour,My older brother and twin sister were leaving ...,1/17/2004,28.9783333,-96.645833,T9
4,10/10/1960 20:00,kaneohe,hi,us,light,900,15 minutes,AS a Marine 1st Lt. flying an FJ4B fighter/att...,1/22/2004,21.4180556,-157.803611,T1


In [None]:
###Count of Documents that Belong to Each Topic
ufo['topic'].value_counts()

In [56]:
###Reading DataFrame with Dominant Topic Appended to Each Document to CSV
ufo.to_csv(r'/Users/benroberts/Downloads/MSA-Fall1/Python/ufo_topic.csv',index=None)