In [None]:
#importing necessary libraries for future analysis of the dataset
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

import seaborn as sns
#import nltk
#To open dialog download:
#nltk.download();
#To download just stopwords:
#nltk.download('stopwords');


import nltk

import nltk.corpus
# two types of stemmers. PorterStemmer algorithm from 1979. PorterStemmer uses Suffix Stripping to produce stems. 

from nltk.stem import PorterStemmer
#The PorterStemmer algorithm does not follow linguistic rules; instead it uses a set of five rules for different cases,
#applied in phases (step by step), to generate stems.
#This is the reason why PorterStemmer does not often generate stems that are actual English words.
#It does not keep a lookup table for actual stems of the word but applies algorithmic rules to generate stems.
#PorterStemmer is known for its simplicity and speed. 

from nltk.stem import LancasterStemmer
#The LancasterStemmer (Paice-Husk stemmer) is an iterative algorithm with rules saved externally.
#One table containing about 120 rules indexed by the last letter of a suffix. 
#On each iteration, it tries to find an applicable rule by the last character of the word. 
#Each rule specifies either a deletion or replacement of an ending. If there is no such rule, it terminates. 
#It also terminates if a word starts with a vowel and there are only two letters left
#or if a word starts with a consonant and there are only three characters left. 
#Otherwise, the rule is applied, and the process repeats.
#LancasterStemmer is simple, but its iterations make the stemming heavy, and over-stemming may occur.
#Over-stemming produces stems that are not linguistic, or that have no meaning.



from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer
import string
#download the NLTK resources requested by this notebook, in the original order.
#the last call is kept as a bare trailing expression so the cell still echoes
#its return value as before.
for resource_id in ('stopwords', 'punkt', 'wordnet', 'maxent_ne_chunker'):
    nltk.download(resource_id)
nltk.download('words')

In [None]:
#load the listings CSV (path is relative to the working directory) into a DataFrame
airbnb = pd.read_csv(filepath_or_buffer='AB_NYC_2019.csv')
#examining head of csv file
#airbnb.head(30)

In [None]:
#English stop words ("the", "a", "an", "in", ...) held in a set for membership tests
stop_words = set(stopwords.words('english'))

#every value of the `name` column, in row order, as a plain Python list
names = list(airbnb.name)
    
#setting a function that will split a name string into separate words
def split_name(name):
    """Split a listing name into its whitespace-separated words.

    Parameters
    ----------
    name : object
        Normally a string. Any other non-missing value is converted with
        ``str()`` before being split.

    Returns
    -------
    list of str
        The words of ``name``. A missing name (``None`` or a float NaN)
        yields an empty list, so it does not contribute the literal words
        "None"/"nan" to later word counts.
    """
    #NaN is the only float value that is not equal to itself
    if name is None or (isinstance(name, float) and name != name):
        return []
    return str(name).split()

#translation table that deletes ASCII punctuation and digits from a word
strip_table = str.maketrans('', '', string.punctuation + string.digits)

#every word of every name, lower-cased (raw form, punctuation still attached)
names_for_count = [word.lower() for x in names for word in split_name(x)]

#stop-word removal works on the cleaned form of each word: filtering only the
#raw form lets tokens such as "the," or "in!" through, and they turn back into
#"the"/"in" as soon as punctuation is removed. The raw form is checked too, so
#a word whose raw spelling is itself in the stop list is still dropped.
no_stop_words = []
for raw_word in names_for_count:
    clean_word = raw_word.translate(strip_table)
    if not clean_word:
        #the token consisted only of punctuation and/or digits
        continue
    if raw_word in stop_words or clean_word in stop_words:
        continue
    no_stop_words.append(clean_word)

#lemmatizing the list of keywords.
#Lemmatization, unlike stemming, reduces inflected words to a root word that belongs to the language.
#In lemmatization the root word is called the lemma.
#A lemma (plural lemmas or lemmata) is the canonical form, dictionary form, or citation form of a set of words.
#Words are lemmatized after cleaning so that e.g. "rooms," is seen as "rooms".
lemmatizer = WordNetLemmatizer()
lemmatized_words = [lemmatizer.lemmatize(w) for w in no_stop_words] #, pos="v" as parameter to lemmatize


In [None]:
#getting rid of punctuation and digits.
#the translation tables are built once here instead of once per word
punctuation_table = str.maketrans('', '', string.punctuation)
digit_table = str.maketrans('', '', "0123456789")

no_punctuation = [x.translate(punctuation_table) for x in lemmatized_words]
no_digits = [x.translate(digit_table) for x in no_punctuation]

#independent copy of the cleaned words; may still contain empty strings
cleaned_and_lemmatized = list(no_digits)

In [None]:
#getting rid of empty strings (words that were nothing but punctuation/digits)
cleaned_and_lemmatized = [i for i in cleaned_and_lemmatized if i]

#show the size and a short preview instead of echoing the whole list into the cell output
print(len(cleaned_and_lemmatized), 'words')
cleaned_and_lemmatized[:20]





In [None]:
#we are going to use Counter to find the 30 most used words.
from collections import Counter

#top 30 words used by hosts to name their listing, as (word, count) pairs,
#most frequent first
top_30_w = Counter(cleaned_and_lemmatized).most_common(30) #unique_meaningful_words / cleaned_and_lemmatized

In [None]:
#display the list of (word, count) pairs for the 30 most used words
top_30_w

In [None]:
#now let's put our findings in a dataframe for further visualizations.
#the columns are named at construction: no in-place rename is needed, and the
#'Words'/'Count' columns exist even if top_30_w is empty
keywordsDF = pd.DataFrame(top_30_w, columns=['Words', 'Count'])

In [None]:
#we are going to use a barplot for this visualization
viz_5 = sns.barplot(x='Words', y='Count', data=keywordsDF)
viz_5.set(
    title='Counts of the top 30 used words for listing names',
    xlabel='Words',
    ylabel='Count of words',
)
#rotate the tick labels that are already on the axis; re-assigning them through
#set_xticklabels is unnecessary and echoed a list of Text objects as cell output
viz_5.tick_params(axis='x', labelrotation=80)