## Installations Required
<br>
!pip install geopandas
<br>
!pip3 install shapely==1.5.17.post1
<br>
!pip install geojsonio
<br>
!pip install langdetect
<br>
!pip install cufflinks

## Import Statements

In [None]:
import pandas as pd 
import geopandas as gpd
import geojsonio
import numpy as np
import os
import glob   
import gc
import time 
from collections import Counter
import seaborn as sns 
import matplotlib.pyplot as plt
from langdetect import detect
from wordcloud import WordCloud, STOPWORDS
#https://towardsdatascience.com/sentimental-analysis-using-vader-a3415fef766
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from tqdm import tqdm, tqdm_pandas
from bs4 import BeautifulSoup
#Plotly Tools
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.offline as offline
offline.init_notebook_mode()
from plotly import tools
import plotly.tools as tls
init_notebook_mode(connected=True)
#https://stackoverflow.com/questions/55132071/series-object-has-no-attribute-iplot/55132247
import cufflinks as cf
cf.go_offline()
cf.set_config_file(offline=False, world_readable=True)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()


## Global Variables

In [None]:
#File Location Variables
# Root data folder and the sub-folder of the location being analysed.
_DataFolderPath="Data"
_LocationName="Jersey_NJ"
# Base names (without extension) of the gzipped CSV files; the loader
# compares them with path_base_name(...) and appends ".csv.gz" itself.
_ListingCSV="listings"
_ReviewsCSV="reviews"
# The neighbourhood files are referenced with their full file names.
_NeighbourhoodsCSV="neighbourhoods.csv"
_CalendarCSV="calendar"
_NeighbourhoodsJson="neighbourhoods.geojson"
# "<data folder>/<location>": every file below is read relative to this path.
_LocationPath=_DataFolderPath +"/" + _LocationName


In [None]:
# Data Frames
# Listings, calendar and reviews start empty; they are filled by the
# file-loading cell further down.
_DF_Listing= pd.DataFrame()
_DF_Calendar=pd.DataFrame()
_DF_Reviews=pd.DataFrame()
# The neighbourhood table and its GeoJSON geometry are read immediately from
# the location folder.
_DF_Neighbourhoods= pd.read_csv( _LocationPath + "/" + _NeighbourhoodsCSV)
_DF_Neighbourhoods_json = gpd.read_file( _LocationPath + "/" + _NeighbourhoodsJson)

# Helper Methods

In [None]:
#https://stackoverflow.com/questions/678236/how-to-get-the-filename-without-the-extension-from-a-path-in-python
def file_base_name(file_name):
    """Return the part of ``file_name`` that precedes its first dot.

    Every extension is removed at once, so ``"listings.csv.gz"`` becomes
    ``"listings"``. A name without a dot is returned unchanged.
    """
    stem, _dot, _extensions = file_name.partition('.')
    return stem

def path_base_name(path):
    """Return the final component of ``path`` cut at its first dot."""
    _directory, leaf = os.path.split(path)
    return file_base_name(leaf)

In [None]:
#https://stackoverflow.com/questions/39100971/how-do-i-release-memory-used-by-a-pandas-dataframe
def cleanDataFrame(objDf):
    """Drop the local reference to ``objDf`` and return a fresh empty frame.

    Deleting the parameter only removes this function's own reference; the
    caller's variable still keeps the data alive. To actually release the
    memory the caller has to rebind its name to the returned frame::

        df = cleanDataFrame(df)

    The original body assigned the empty frame to the local name and then
    discarded it, so the call had no visible effect.

    Parameters
    ----------
    objDf : pandas.DataFrame
        Frame the caller no longer needs.

    Returns
    -------
    pandas.DataFrame
        A new, empty DataFrame.
    """
    del objDf
    gc.collect()
    return pd.DataFrame()


In [None]:
#Method to clear Empty Spaces
def replaceSpaces(text):
    """Return ``text`` as a trimmed string with spaces turned into underscores.

    The value is first coerced with ``str`` so non-string inputs (numbers,
    NaN) are accepted; previously the coerced value was thrown away and
    ``strip`` was called on the raw argument, which failed for non-strings.
    Literal backslash sequences ``\\r``, ``\\"`` and ``\\n`` (two characters
    each, as found in escaped text) are removed before the replacement.
    """
    temp = str(text).strip()
    for escaped in ('\\r', '\\"', '\\n'):
        temp = temp.replace(escaped, '')
    return temp.replace(' ', '_')

In [None]:
# https://stackoverflow.com/a/47091490/4084039
import re
def decontracted(phrase):
    """Expand common English contractions in ``phrase``.

    The rewrite rules are applied in order: the two irregular forms come
    first so that the generic ``n't`` rule does not mangle them.
    """
    rewrites = (
        # specific
        (r"won't", "will not"),
        (r"can\'t", "can not"),
        # general
        (r"n\'t", " not"),
        (r"\'re", " are"),
        (r"\'s", " is"),
        (r"\'d", " would"),
        (r"\'ll", " will"),
        (r"\'t", " not"),
        (r"\'ve", " have"),
        (r"\'m", " am"),
    )
    for pattern, expansion in rewrites:
        phrase = re.sub(pattern, expansion, phrase)
    return phrase

In [None]:
#Detects Language from Given text
def detectLanguage(phrase):
    """Return the language code ``detect`` reports for ``phrase``.

    Any error raised by the detector (for example on empty or non-text
    input) yields the placeholder ``'na'``. Only ``Exception`` subclasses are
    caught, so a ``KeyboardInterrupt`` can still stop a long-running
    ``apply`` over the review comments.
    """
    try:
        return detect(phrase)
    except Exception:
        return 'na'


In [None]:
#https://stackoverflow.com/questions/60214194/error-in-reading-stock-data-datetimeproperties-object-has-no-attribute-week
#https://docs.python.org/3/library/time.html
def getWeekDayNumber(text):
    """Translate a full weekday name such as ``"Monday"`` to 0 (Monday) .. 6."""
    parsed = time.strptime(text, '%A')
    return parsed.tm_wday

def getMonthNumber(text):
    """Translate a full month name such as ``"January"`` to 1 .. 12."""
    parsed = time.strptime(text, '%B')
    return parsed.tm_mon

In [None]:
# To get the results to 4 decimal places
SAFE_DIV = 0.0001 
STOP_WORDS = stopwords.words("english")

def preprocess(x):
    """Normalise free text before sentiment scoring and length features.

    Steps, in order:

    1. coerce to ``str`` and lower-case;
    2. strip HTML markup with BeautifulSoup (this has to happen *before*
       punctuation is removed, otherwise the angle brackets are already gone
       and tags such as ``<br/>`` survive as the words ``br``);
    3. expand contractions, currency symbols and large-number suffixes;
    4. replace every non-word character with a space;
    5. pass the result through the Porter stemmer.

    Parameters
    ----------
    x : object
        Raw text; any value is accepted and converted with ``str``.

    Returns
    -------
    str
        The cleaned text.
    """
    x = str(x).lower()

    # An explicit parser keeps the output independent of which optional
    # parsers are installed; the separator stops the words on both sides of a
    # removed tag from being glued together.
    x = BeautifulSoup(x, "html.parser").get_text(" ")

    x = x.replace(",000,000", "m").replace(",000", "k").replace("′", "'").replace("’", "'")\
                           .replace("won't", "will not").replace("cannot", "can not").replace("can't", "can not")\
                           .replace("n't", " not").replace("what's", "what is").replace("it's", "it is")\
                           .replace("'ve", " have").replace("i'm", "i am").replace("'re", " are")\
                           .replace("he's", "he is").replace("she's", "she is").replace("'s", " own")\
                           .replace("%", " percent ").replace("₹", " rupee ").replace("$", " dollar ")\
                           .replace("€", " euro ").replace("'ll", " will")
    x = re.sub(r"([0-9]+)000000", r"\1m", x)
    x = re.sub(r"([0-9]+)000", r"\1k", x)

    # Raw string: '\W' in a normal literal is an invalid escape sequence.
    x = re.sub(r"\W", " ", x)

    # NOTE(review): stem() receives the whole text as if it were one word, so
    # in practice only the tail of the string is affected. This is kept as in
    # the original on purpose: stemming every token would produce stems that
    # a lexicon-based sentiment scorer is unlikely to recognise. Confirm the
    # intent before changing it.
    x = PorterStemmer().stem(x)

    return x
    

In [None]:
SENTIMENT_ANALYZER = SentimentIntensityAnalyzer()
def getSentimentScore_Compound(text):
    """Return the ``'compound'`` polarity score of ``text``.

    The score is taken from ``SENTIMENT_ANALYZER.polarity_scores``. The
    function used to be defined twice with identical bodies; the second copy
    has been removed.
    """
    statement_polarity = SENTIMENT_ANALYZER.polarity_scores(text)
    return statement_polarity['compound']

In [None]:
%%time
#https://stackoverflow.com/questions/10377998/how-can-i-iterate-over-files-in-a-given-directory
#https://www.geeksforgeeks.org/python-os-scandir-method/
totalFilesMergeCount = 0
totalCalendarsMereCount = 0
totalReviewsMergeCoun = 0
# Load the listings of the most recent quarter (sub-folder "0").
_DF_Listing = pd.read_csv(_LocationPath + "/0/" + _ListingCSV + ".csv.gz", compression='gzip')

# Collect the calendar and review rows of every sub-folder, keeping only rows
# that belong to a listing of the current quarter. The pieces are gathered in
# lists and concatenated once at the end: concatenating inside the loop
# copies the accumulated frame again for every file.
calendar_frames = []
review_frames = []
with os.scandir(_LocationPath) as objDir:
    for entry in objDir:
        if not entry.is_dir():
            continue
        for file in glob.glob(entry.path + "/*.gz"):
            totalFilesMergeCount += 1
            fileName = path_base_name(file)
            if fileName not in (_CalendarCSV, _ReviewsCSV):
                continue
            df = pd.read_csv(file, compression='gzip')
            # Filter once and reuse the result for both the data and the count.
            matching = df[df.listing_id.isin(_DF_Listing.id)]
            if fileName == _CalendarCSV:
                calendar_frames.append(matching)
                totalCalendarsMereCount += matching.shape[0]
            else:
                review_frames.append(matching)
                totalReviewsMergeCoun += matching.shape[0]
            # Release the unfiltered frame before the next file is read.
            del df, matching

if calendar_frames:
    _DF_Calendar = pd.concat([_DF_Calendar] + calendar_frames, ignore_index=True)
if review_frames:
    _DF_Reviews = pd.concat([_DF_Reviews] + review_frames, ignore_index=True)
# Deleting the lists here (instead of a loop variable) cannot fail when no
# file was found.
del calendar_frames, review_frames
gc.collect()

print ("Total Files Merged : " ,  totalFilesMergeCount)
print ("Total Records Inserted for Calendar : " , totalCalendarsMereCount)
print ("Total Records Inserted for Reviews : ", totalReviewsMergeCoun)

In [None]:
print ("Listings : ", _DF_Listing.shape)
print ("Calendars : ", _DF_Calendar.shape)
print ("Reviews : ", _DF_Reviews.shape)

## Getting Information About Reviews DataSet

In [None]:
_DF_Reviews.describe()

In [None]:
_DF_Reviews.info()


In [None]:
_DF_Reviews.date.max()

In [None]:
_DF_Reviews.date.min()

In [None]:
#

##########################################################################################
#Trying to find duplicate reviews for same user in same listing for same date.
#Not working
#Need help
#########################################################################################


In [None]:
# A review counts as duplicated when another row has the same listing,
# reviewer and date. keep=False marks every member of such a group (not only
# the later copies), and sorting puts the members next to each other so they
# can be compared by eye in the next cell.
# NOTE(review): the reviews are merged from several sub-folders, so repeated
# rows may simply be the same review `id` loaded more than once - confirm,
# and if so de-duplicate on `id` instead.
review_key = ['listing_id', 'reviewer_id', 'date']
x = _DF_Reviews[_DF_Reviews.duplicated(subset=review_key, keep=False)].sort_values(review_key)
# Number of surplus rows, i.e. rows that repeat an earlier (listing, reviewer, date).
_DF_Reviews.duplicated(subset=review_key).sum()


In [None]:
x

In [None]:
#

##########################################################################################
#Trying to find duplicate reviews for same user in same listing for same date.
#Not working
#Need help
#########################################################################################

In [None]:
# Preprocess Data for Reviews Text

In [None]:
_DF_Reviews[(_DF_Reviews['id']==608103333) & (_DF_Reviews['listing_id']==42075992)]


In [None]:
_DF_Reviews.listing_id[256353]

In [None]:
_DF_Reviews.comments[256353]

In [None]:
detectLanguage(_DF_Reviews.comments[256353])

In [None]:
_DF_Reviews.comments[10000]

In [None]:
detectLanguage('혜빈	숙소의 장점은 너무나도 많지만 몇가지 추려보자면...☺️\n✔️맨해트보다 너무너무')

In [None]:
detectLanguage('I strongly recommend you to stay this place. T..')

In [None]:
%%time
_DF_Reviews['language_type'] = _DF_Reviews['comments'].apply(detectLanguage)

In [None]:
_DF_Reviews.loc[_DF_Reviews['language_type']!='en']


In [None]:
totalLanguagesCount=pd.DataFrame(_DF_Reviews.language_type.unique()).shape[0]
print('Count/List of languages are used in comments -:', totalLanguagesCount)
_DF_Reviews.language_type.unique()

In [None]:
totalReviewsCount=_DF_Reviews.shape[0]
reviews_not_in_english=_DF_Reviews.loc[_DF_Reviews['language_type']!='en'].shape[0]
print ("Total Reviews : ", totalReviewsCount)
print ("Total Reviews Not English: ", reviews_not_in_english)
percentage_diffLanguages=(reviews_not_in_english/totalReviewsCount)*100
print ("Percenatge of Differnt Languages : ",percentage_diffLanguages,"%")


In [None]:
#We are removing comments which are non-english for processing

In [None]:
# Keep English reviews only. The explicit copy makes the result an
# independent frame, so the columns added to it in the following cells do
# not trigger SettingWithCopyWarning or write into a temporary view.
_DF_Reviews = _DF_Reviews[_DF_Reviews['language_type'] == 'en'].copy()

In [None]:
%%time
_DF_Reviews["comments_cleaned"] = _DF_Reviews["comments"].fillna("").apply(preprocess)

In [None]:
#Get Sentiment Scores of Reviews

In [None]:
%%time
_DF_Reviews['Review_Score']=_DF_Reviews.comments_cleaned.apply(getSentimentScore_Compound)

In [None]:
 %%time
_DF_Reviews['Review_Len']=_DF_Reviews.comments_cleaned.apply(len)


In [None]:
#Add features Related to Reviews in Listings Table

In [None]:
%%time
#Empty Arrays Declaration for  Avg Review Scores, Review Count, Avg Review Lenght
from tqdm import tqdm

# Per-listing review statistics, computed with one grouped pass over the
# reviews. The previous loop filtered the whole review table once per listing
# and summed every column (including the free-text ones) just to read two
# numeric totals.
reviews_by_listing = _DF_Reviews.groupby('listing_id')
review_stats = pd.DataFrame({
    'avg_review_score': reviews_by_listing['Review_Score'].mean(),
    'total_reviews_count': reviews_by_listing['Review_Score'].size(),
    'avg_review_len': reviews_by_listing['Review_Len'].mean(),
    'past_review_date': reviews_by_listing['date'].min(),
    'laste_review_date': reviews_by_listing['date'].max(),
})

# Attach the statistics to the listings in the original column order.
# Listings without any review get NaN for the averages and dates ...
for stat_name in review_stats.columns:
    _DF_Listing[stat_name] = _DF_Listing['id'].map(review_stats[stat_name])
# ... and an explicit count of 0.
_DF_Listing['total_reviews_count'] = _DF_Listing['total_reviews_count'].fillna(0).astype(int)

# Getting Information About Calendars

In [None]:
_DF_Calendar.info()

In [None]:
# Remove everything except digits, '.' and '-' (currency symbols, thousands
# separators) and convert to float. The cleaned values are assigned back to
# the column: an in-place replace on a column selection may operate on a
# temporary copy and leave the frame untouched.
_DF_Calendar['price'] = _DF_Calendar['price'].replace(regex=True, to_replace=r'[^0-9.\-]', value=r'').astype(float)
_DF_Calendar['adjusted_price'] = _DF_Calendar['adjusted_price'].replace(regex=True, to_replace=r'[^0-9.\-]', value=r'').astype(float)
#_DF_Calendar['available'] = _DF_Calendar['available'].astype(bool)

In [None]:
_DF_Calendar.describe()

In [None]:
print('We have', _DF_Calendar.date.nunique(), 'days and', _DF_Calendar.listing_id.nunique(), 'unique listings in the calendar data.')

In [None]:
_DF_Calendar['date'] = pd.to_datetime(_DF_Calendar['date'])

In [None]:
_DF_Calendar.date.min(), _DF_Calendar.date.max()

In [None]:
_DF_Calendar.available.value_counts()

In [None]:
_DF_Calendar[_DF_Calendar.available=='t']

In [None]:
_DF_Calendar['available'] = _DF_Calendar.available.map(lambda x: 1 if x == 't' else 0)

In [None]:
_DF_Calendar.available.value_counts()

In [None]:
 _DF_Calendar[_DF_Calendar.listing_id==917065]

In [None]:
_DF_Calendar['day_of_week'] = _DF_Calendar.date.dt.day_name()
_DF_Calendar['month'] = _DF_Calendar.date.dt.month_name()


In [None]:
# Weekday (Monday = 0 .. Sunday = 6) and month (1 .. 12) numbers taken
# directly from the datetime column. This gives the same numbers as parsing
# the day/month names back with time.strptime, without a Python call per row
# and without depending on the locale used for the names.
_DF_Calendar['week_day_num'] = _DF_Calendar['date'].dt.dayofweek
_DF_Calendar['month_num'] = _DF_Calendar['date'].dt.month

In [None]:
# Average availability, prices and night limits per weekday. The columns are
# selected with a list (double brackets); a bare comma-separated selection
# after groupby is rejected by pandas 2.
tempCalendar_g=_DF_Calendar.groupby('week_day_num')[['available', 'price','adjusted_price','minimum_nights','maximum_nights']].mean().reset_index().rename(columns={'available':'avg_availablilty_rate','price' : 'avg_price','adjusted_price' : 'avg_adjusted_price','minimum_nights' :'avg_minimum_nights','maximum_nights':'avg_maximum_nights'})

In [None]:
tempCalendar_g

In [None]:
_DF_Listing


In [None]:
def getAVR(val):
    """Turn a 0..1 rate into the percentage of its complement."""
    complement = 1 - val
    return complement * 100

In [None]:
#https://stackoverflow.com/questions/2468334/python-how-to-create-dynamic-and-expandable-dictionaries
#https://stackoverflow.com/questions/28218698/how-to-iterate-over-columns-of-pandas-dataframe-to-run-regression
def updateDicForCalendarsListings(df_cal,groupByName):
    """Flatten a per-month or per-weekday summary into one dict of features.

    Every row of ``df_cal`` describes one period. For each value column a key
    ``"<period abbreviation>_<column name>"`` (for example ``"jan_avg_price"``
    or ``"mon_avg_price"``) is created and the value, rounded to two
    decimals, is appended to the list stored under that key.

    Only the periods actually present in ``df_cal`` are processed. The
    previous version always looped over 12 months / 7 weekdays and raised
    ``KeyError`` for a listing with fewer periods; it also used
    ``DataFrame.iteritems``, which pandas 2 no longer provides.

    Parameters
    ----------
    df_cal : pandas.DataFrame
        Summary with a ``month_num`` (1-12) or ``week_day_num`` (0-6) column
        plus numeric value columns.
    groupByName : str
        ``'m'`` for a monthly summary; any other value means weekly.

    Returns
    -------
    dict
        Maps feature name to a list of rounded floats.
    """
    # Imported here so the function does not depend on a later cell having
    # imported the module already.
    import calendar

    if groupByName == 'm':
        colKeyName, abbreviations = 'month_num', calendar.month_abbr
    else:
        colKeyName, abbreviations = 'week_day_num', calendar.day_abbr

    value_columns = [name for name in df_cal.columns if name != colKeyName]
    dict_listings = {}
    for _, period in df_cal.iterrows():
        # iterrows may upcast the key to float, hence the int().
        prefix = abbreviations[int(period[colKeyName])].lower()
        for name in value_columns:
            value = round(float(period[name]), 2)
            dict_listings.setdefault(prefix + '_' + name, []).append(value)
    return dict_listings

In [None]:
%%time
import calendar
from tqdm import tqdm

# Calendar columns that are averaged, and the names of the resulting features.
calendar_value_columns = ['available', 'price', 'adjusted_price', 'minimum_nights', 'maximum_nights']
calendar_feature_names = {'available': 'avg_availablilty_rate', 'price': 'avg_price',
                          'adjusted_price': 'avg_adjusted_price', 'minimum_nights': 'avg_minimum_nights',
                          'maximum_nights': 'avg_maximum_nights'}


def _averageCalendarBy(listing_calendar, period_column):
    """Average the calendar values of one listing per weekday or per month."""
    averages = (listing_calendar.groupby(period_column)[calendar_value_columns]
                .mean().reset_index().rename(columns=calendar_feature_names))
    # Convert the mean of `available` with getAVR exactly once.
    averages['avg_availablilty_rate'] = averages['avg_availablilty_rate'].apply(getAVR)
    return averages


# One grouped pass over the calendar instead of filtering the whole table
# once per listing. Only listings present in _DF_Listing are considered, and
# listings without calendar rows simply produce no group.
listing_calendars = _DF_Calendar[_DF_Calendar['listing_id'].isin(_DF_Listing['id'])]
calendar_rows = []
for temp_listingid, filtered_calendar in listing_calendars.groupby('listing_id'):
    tempCalendar_weekely = _averageCalendarBy(filtered_calendar, 'week_day_num')
    # The monthly summary now gets its own getAVR conversion; before, the
    # weekly frame was converted twice and the monthly one not at all.
    tempCalendar_monthly = _averageCalendarBy(filtered_calendar, 'month_num')

    dict_week = updateDicForCalendarsListings(tempCalendar_weekely, 'w')
    dict_month = updateDicForCalendarsListings(tempCalendar_monthly, 'm')
    dict_merge = {**dict_week, **dict_month}
    # Stored as a one-element list like every other value, because the next
    # cell unwraps each cell with x[0].
    dict_merge['id'] = [temp_listingid]
    calendar_rows.append(dict_merge)

# Build the frame once; DataFrame.append was removed in pandas 2 and copied
# the whole frame on every call.
df_new = pd.DataFrame(calendar_rows)
    

In [None]:
#https://stackoverflow.com/questions/65465625/remove-square-brackets-from-all-rows-in-data-frame
# Every feature is stored as a one-element list; unwrap it. Cells that hold
# no list (NaN for a period a listing has no data for) are left as they are
# instead of failing on x[0].
df_new = df_new.applymap(lambda x: x[0] if isinstance(x, list) else x)

In [None]:
df_new

In [None]:
#https://www.shanelynn.ie/merge-join-dataframes-python-pandas-index-1/
_DF_Listing = pd.merge(_DF_Listing,
                df_new,
                 on='id',how='left')

## Getting Information About NeighbourHood

In [None]:
_DF_Neighbourhoods.head()


In [None]:
_DF_Neighbourhoods.describe()

In [None]:
_DF_Neighbourhoods.info()

In [None]:
print(_DF_Neighbourhoods_json.head())

In [None]:
# Read the GeoJSON through a context manager so the file handle is closed
# as soon as the text has been read.
with open(_LocationPath + "/" + _NeighbourhoodsJson) as geojson_file:
    contents = geojson_file.read()
geojsonio.display(contents)

# EDA For Listings

In [None]:
_DF_Listing.info(verbose=True, null_counts=True)

In [None]:
#We don't need following columns:
'''
1. listing_url -" URL for Every Listing we dont need this"
2. scrape_id -: Related toData Collection
3. last_scraped =" Lasy date of data collected"
4. name of property
5. picture_url
6.host_url                                      
7. c 
8.host_thumbnail_url
'''

In [None]:
_DF_Listing_EDA=_DF_Listing.drop(columns=['listing_url', 'scrape_id','last_scraped','name','picture_url','host_url','host_url','host_thumbnail_url'])
_DF_Listing_EDA=_DF_Listing_EDA.reset_index(drop=True)

In [None]:
#https://www.kaggle.com/mistrzuniu1/tutorial-eda-feature-selection-regression
total = _DF_Listing_EDA.isnull().sum().sort_values(ascending = False)
percent = (_DF_Listing_EDA.isnull().sum()/_DF_Listing_EDA.isnull().count()*100).sort_values(ascending = False)
missing_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
pd.set_option('display.max_rows', 500)
missing_data


In [None]:
del missing_data

In [None]:
#Dropping Columns which are 100 % Null

In [None]:
_DF_Listing_EDA=_DF_Listing_EDA.drop(columns=['neighbourhood_group_cleansed', 'bathrooms','calendar_updated','license'])
_DF_Listing_EDA=_DF_Listing_EDA.reset_index(drop=True)

In [None]:
#Text Preprocessing and NLP for host_about

In [None]:
_DF_Listing_EDA.fillna({'host_about':'na'}, inplace=True)

In [None]:
_DF_Listing_EDA.host_about

In [None]:
%%time
_DF_Listing_EDA["host_about"] = _DF_Listing_EDA["host_about"].fillna("na").apply(preprocess)


In [None]:
_DF_Listing_EDA.host_about

In [None]:

%%time
_DF_Listing_EDA['host_about_score']=_DF_Listing_EDA.host_about.apply(getSentimentScore_Compound)


In [None]:
_DF_Listing_EDA['host_about_score']

In [None]:
 %%time
_DF_Listing_EDA['host_about_len']=_DF_Listing_EDA.host_about.apply(len)

In [None]:
_DF_Listing_EDA['host_about_len']

In [None]:

_DF_Listing_EDA=_DF_Listing_EDA.drop(columns=['host_about'])
_DF_Listing_EDA=_DF_Listing_EDA.reset_index(drop=True)

In [None]:
# NLP and Text Processing of  host_neighbourhood

In [None]:
%%time
_DF_Listing_EDA["host_neighbourhoodd"] = _DF_Listing_EDA["host_neighbourhood"].fillna("").apply(preprocess)


In [None]:
_DF_Listing_EDA["host_neighbourhoodd"].unique()

In [None]:
#https://stackoverflow.com/questions/60102928/pandas-fillna-only-numeric-int-or-float-columns
numeric_columns = _DF_Listing_EDA.select_dtypes(include=['number']).columns


In [None]:
#fill Null Value with 0 or na
_DF_Listing_EDA.fillna({'reviews_per_month':0}, inplace=True)
_DF_Listing_EDA.fillna({'neighborhood_overview':'na'}, inplace=True)
_DF_Listing_EDA.fillna({'neighbourhood':'na'}, inplace=True)

In [None]:
# fill 0 to all NaN 
_DF_Listing_EDA[numeric_columns] = _DF_Listing_EDA[numeric_columns].fillna(0)

In [None]:
_DF_Listing_EDA.fillna('na')

In [None]:
#https://www.kaggle.com/mistrzuniu1/tutorial-eda-feature-selection-regression
total = _DF_Listing_EDA.isnull().sum().sort_values(ascending = False)
percent = (_DF_Listing_EDA.isnull().sum()/_DF_Listing_EDA.isnull().count()*100).sort_values(ascending = False)
missing_data  = pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])
pd.set_option('display.max_rows', 500)
missing_data

In [None]:
del missing_data

In [None]:
_DF_Listing_EDA.info(verbose=True)

In [None]:
_DF_Listing_EDA[_DF_Listing_EDA.id <=0]

In [None]:
#convert price to Float and Remove Special Characters

In [None]:
# Remove everything except digits, '.' and '-' and convert to float. The
# result is assigned back to the column because an in-place replace on a
# column selection may operate on a temporary copy.
_DF_Listing_EDA['price'] = _DF_Listing_EDA['price'].replace(regex=True, to_replace=r'[^0-9.\-]', value=r'').astype(float)

In [None]:
#Find if Price <=0 we will remove if price is <=0

In [None]:
_DF_Listing_EDA[_DF_Listing_EDA.price <=0]

# Target Feature

## Price

In [None]:
_DF_Listing_EDA['price'].describe()



In [None]:
_DF_Listing_EDA['price'].median()


In [None]:
sns.distplot(_DF_Listing_EDA['price'])

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(_DF_Listing_EDA['price'], kde=False);


# Univariate and Bivariate Analysis for Each Column

## Features-:

### 1. Description

In [None]:
_DF_Listing_EDA['description']

In [None]:
_DF_Listing_EDA["description"] = _DF_Listing_EDA["description"].fillna("").apply(preprocess)
description_corpus= _DF_Listing_EDA['description'].values


In [None]:
#https://www.geeksforgeeks.org/tf-idf-model-for-page-ranking/
#https://stackoverflow.com/questions/55547506/how-to-calculate-tfidf-score-from-a-column-of-dataframe-and-extract-words-with-a

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf_vec = TfidfVectorizer(stop_words='english')
desc_tfidf=tfidf_vec.fit_transform(description_corpus)

In [None]:
df_desc_scores = pd.DataFrame(desc_tfidf.toarray(), columns=tfidf_vec.get_feature_names())

In [None]:
tokens_above_threshold = df_desc_scores.max()[df_desc_scores.max() > 0.3].sort_values(ascending=False)


In [None]:
#https://stackoverflow.com/questions/26097916/convert-pandas-series-to-dataframe
df_desc_scores=pd.DataFrame({'features':tokens_above_threshold.index, 'score':tokens_above_threshold.values})


In [None]:
df_desc_scores
    

### 2.   Neighborhood

#### 2 .1  Neighborhood (Categories)

In [None]:
# Clean neighbourhood Categories

In [None]:
neighbourhood_values = list(_DF_Listing_EDA['neighbourhood'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python

neighbourhood_list = []
for i in neighbourhood_values:
    temp = ""
    # consider we have text like this "NY,NJ,MH"
    if str(i) != 'nan':
        # Split on commas, drop the spaces inside each part and join the
        # parts with single spaces: "Jersey City, New Jersey" -> "JerseyCity NewJersey".
        temp = " ".join(part.replace(' ', '') for part in i.split(','))
        temp = temp.replace('&', '_')
        temp = temp.replace('UnitedStates', 'USA')
        temp = temp.replace('NewJersey', 'NJ')
    # Append for every row, missing values included (as an empty string), so
    # the list always has one entry per row and can be assigned as a column.
    neighbourhood_list.append(temp.strip())

_DF_Listing_EDA['neighbourhood'] = neighbourhood_list

# count of all the words in corpus python: https://stackoverflow.com/a/22898595/4084039
my_counter = Counter()
for word in _DF_Listing_EDA['neighbourhood'].values:
    my_counter.update(word.split())

neighbourhood_dict = dict(my_counter)
sorted_neighbourhood_dict = dict(sorted(neighbourhood_dict.items(), key=lambda kv: kv[1]))

#### 2.2   Neighborhood_overview      

In [None]:
#Doing the following for neighborhood_overview:
# 1. Text preprocessing
# 2. Calculating sentiment scores
# 3. Calculating the length of the overview
# 4. Calculating the word count of the overview

In [None]:
%%time
_DF_Listing_EDA['neighborhood_overview'] = _DF_Listing_EDA["neighborhood_overview"].fillna("").apply(preprocess)
_DF_Listing_EDA['neighborhood_overview_score']=_DF_Listing_EDA.neighborhood_overview.apply(getSentimentScore_Compound)
_DF_Listing_EDA['neighborhood_overview_len']=_DF_Listing_EDA.neighborhood_overview.apply(len)
_DF_Listing_EDA['neighborhood_overview_word_count'] = _DF_Listing_EDA['neighborhood_overview'].apply(lambda x: len(str(x).split()))
_DF_Listing_EDA.drop(['neighborhood_overview'], axis=1, inplace=True)

In [None]:

# Work on an independent copy of the neighbourhood columns: the scaled values
# written below must not go into (or warn about) a view of _DF_Listing_EDA.
_DF_Listing_EDA_nhood=_DF_Listing_EDA[['neighbourhood','price','neighborhood_overview_score','neighborhood_overview_len','neighborhood_overview_word_count','neighbourhood_cleansed','longitude','latitude']].copy()

# Min-max scale the price and the three overview features.
overview_scaled_columns = ['price','neighborhood_overview_score','neighborhood_overview_len','neighborhood_overview_word_count']
_DF_Listing_EDA_nhood[overview_scaled_columns] = scaler.fit_transform(_DF_Listing_EDA_nhood[overview_scaled_columns])

In [None]:
# Analysis of Three features added for neighbourhood_overview
    # neighborhood_overview_score
    #neighborhood_overview_len
    #neighborhood_overview_word_count

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price'].mean().iplot(kind='bar',  xTitle='Neighbourhood', yTitle='Average Price')
                             

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood_cleansed')['price'].mean().iplot(kind='bar',  xTitle='Neighbourhood', yTitle='Average Price')


In [None]:
_DF_Listing_EDA_nhood[['neighbourhood', 'price']].pivot(columns='neighbourhood', values='price').iplot(kind='box')


In [None]:
_DF_Listing_EDA_nhood[['neighbourhood_cleansed', 'price']].pivot(columns='neighbourhood_cleansed', values='price').iplot(kind='box')


In [None]:
# Columns are selected with a list; a bare multi-column selection after groupby fails on pandas 2.
_DF_Listing_EDA_nhood.groupby('neighbourhood')[['price','neighborhood_overview_score']].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','neighborhood_overview_score']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
# Columns are selected with a list; a bare multi-column selection after groupby fails on pandas 2.
_DF_Listing_EDA_nhood.groupby('neighbourhood')[['price','neighborhood_overview_score',
                                                                  'neighborhood_overview_len','neighborhood_overview_word_count']].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','neighborhood_overview_score','neighborhood_overview_len','neighborhood_overview_word_count']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
# Columns are selected with a list; a bare multi-column selection after groupby fails on pandas 2.
_DF_Listing_EDA_nhood.groupby('neighbourhood')[['price','neighborhood_overview_len','neighborhood_overview_word_count']].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','neighborhood_overview_len','neighborhood_overview_word_count']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
# Columns are selected with a list; a bare multi-column selection after groupby fails on pandas 2.
_DF_Listing_EDA_nhood.groupby('neighbourhood')[['price','neighborhood_overview_score',
                                                                  'neighborhood_overview_len','neighborhood_overview_word_count']].mean().corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")

In [None]:
# Correlation is only defined for numeric columns. The two neighbourhood name
# columns are text, so they are filtered out explicitly; pandas 2 raises
# instead of dropping them silently.
_DF_Listing_EDA_nhood[['price','neighbourhood','neighbourhood_cleansed','longitude','latitude']].select_dtypes(include='number').corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")

In [None]:
_DF_Listing_EDA_nhood

### Observation:
1. Most listings are in Jersey City, NJ.
2. The highest average price is in Hoboken: 207 USD.
3. The minimum average price is in Jersey City, NJ: 40 USD.
4. Hoboken has very few listings compared to Jersey City.
5. Union City has the highest neighbourhood overview scores.
6. Jersey City, Hoboken and Bayonne have similar neighbourhood overview scores.
7. New York has negative overviews, with an average price of 127 USD.
8. Overview length and overview word count are directly related to the average price of the neighbourhood.
9. Overview scores have no impact on price; this column will be removed.

In [None]:
del _DF_Listing_EDA_nhood

### 3. Review Scores of Listings :

We will do price analysis and multivariate analysis with the following features:

    # avg_review_score                              
 
    # total_reviews_count                             
 
    # avg_review_len                                
 
    # past_review_date                               
 
    # laste_review_date                              

In [None]:
_DF_Listing_EDA['past_review_date'] = pd.to_datetime(_DF_Listing_EDA['past_review_date'])
_DF_Listing_EDA['laste_review_date'] = pd.to_datetime(_DF_Listing_EDA['laste_review_date'])

In [None]:
#https://stackoverflow.com/questions/37840812/pandas-subtracting-two-date-columns-and-the-result-being-an-integer/46966942

_DF_Listing_EDA['ReviewDate_Diff'] = (_DF_Listing_EDA['laste_review_date']-_DF_Listing_EDA['past_review_date']).dt.days

In [None]:
_DF_Listing_EDA.fillna({'ReviewDate_Diff':0}, inplace=True)

In [None]:
# Min Max Scaler

In [None]:
# Working frame for the neighbourhood-level review analysis.
# An explicit copy is taken because columns of this frame are overwritten afterwards;
# writing into a plain column selection triggers SettingWithCopyWarning and may not stick.
_DF_Listing_EDA_nhood = _DF_Listing_EDA[['neighbourhood','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','host_id','price']].copy()

In [None]:

# Replace the numeric review columns and price with their min-max scaled values,
# using the notebook-level `scaler` (a MinMaxScaler instance); the scaler is re-fitted here.
_DF_Listing_EDA_nhood[['avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','price']]= scaler.fit_transform(_DF_Listing_EDA_nhood[['avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','price']])


### Listing Reviews Based on Neighbour Hoods

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','avg_review_score'].mean().iplot()

In [None]:
_DF_Listing_EDA.groupby('neighbourhood')['price','avg_review_score'].mean().iplot(kind='box')

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','total_reviews_count'].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','total_reviews_count']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','avg_review_len'].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','avg_review_len']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','ReviewDate_Diff'].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood[['price','ReviewDate_Diff']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff'].mean().iplot()

In [None]:
_DF_Listing_EDA_nhood.groupby('neighbourhood')['price','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff'].mean().corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")

In [None]:
del  _DF_Listing_EDA_nhood

####  Observations:

1. Listings in Jersey City and New York locations have the maximum review scores
2.  Listings in Hoboken have very low review scores
3. All the listings have positive review scores, from 0.06 to 0.08
4. Average listing scores have no impact on neighbourhood listings
5. Review count and review date difference have a strong impact on the price of listings in a neighbourhood.

#### Review Scores Based On Every Listing

In [None]:
_DF_Listing_EDA.describe()[['avg_review_score']]

In [None]:
# Working frame for the per-listing review analysis.
# An explicit copy is taken because columns of this frame are overwritten afterwards;
# writing into a plain column selection triggers SettingWithCopyWarning and may not stick.
df_Listing_EDA_reviews = _DF_Listing_EDA[['id','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','host_id','price']].copy()


In [None]:
# Replace the numeric review columns and price with their min-max scaled values,
# using the notebook-level `scaler` (a MinMaxScaler instance); the scaler is re-fitted here.
df_Listing_EDA_reviews[['avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','price']] = scaler.fit_transform(df_Listing_EDA_reviews[['avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff','price']])

In [None]:
_DF_Listing_EDA['price'].iplot(kind='hist', bins=100,  xTitle='Listing Price in $', yTitle='Count')
                             

In [None]:
df_Listing_EDA_reviews.groupby('id')['avg_review_score','price'].mean().iplot()

In [None]:
df_Listing_EDA_reviews.groupby('id')['price','total_reviews_count'].mean().iplot()

In [None]:
df_Listing_EDA_reviews.groupby('id')['price','avg_review_len'].mean().iplot()

In [None]:
df_Listing_EDA_reviews.groupby('id')['price','ReviewDate_Diff'].mean().iplot()

In [None]:
df_Listing_EDA_reviews.groupby('id')['price','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff'].mean().iplot()

In [None]:
df_Listing_EDA_reviews.groupby('id')['price','avg_review_score','total_reviews_count','avg_review_len','ReviewDate_Diff'].mean().corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")

In [None]:
del df_Listing_EDA_reviews

#### Observation:


1. We took the top 100 and bottom 100 review points based on listings
2. Price of listings is strongly correlated with review count and review date difference
3. Average review score has very little impact on price

### 3.   HOST
      host_about_score                              
      host_about_len                                  
      host_neighbourhoodd  
      host_location                                  
      host_response_time                             
      host_response_rate                             
      host_acceptance_rate                           
      host_is_superhost                              
      host_picture_url                               
      host_neighbourhood                             
      host_listings_count                             
      host_total_listings_count                       
      host_verifications                             
      host_has_profile_pic                           
      host_identity_verified  
      host_since 

In [None]:
_DF_Listing_EDA['host_location']=_DF_Listing_EDA["host_location"].fillna('na').apply(preprocess)

In [None]:
_DF_Listing_EDA['host_response_time']=_DF_Listing_EDA["host_response_time"].fillna('na').apply(preprocess)

In [None]:
_DF_Listing_EDA['host_response_rate']=_DF_Listing_EDA["host_response_rate"].fillna('na').apply(preprocess)

In [None]:
_DF_Listing_EDA['host_neighbourhood']=_DF_Listing_EDA["host_neighbourhood"].fillna('na').apply(preprocess)

In [None]:
# Remove the literal '%' characters from host_acceptance_rate and store the column as float.
_DF_Listing_EDA['host_acceptance_rate'] = (
    _DF_Listing_EDA['host_acceptance_rate']
    .str.replace('%', '', regex=False)
    .astype('float')
)


In [None]:
_DF_Listing_EDA['host_acceptance_rate']=_DF_Listing_EDA["host_acceptance_rate"].fillna(0)

In [None]:
_DF_Listing_EDA['host_acceptance_rate']

In [None]:
_DF_Listing_EDA['host_response_time'].unique()

In [None]:
 _DF_Listing_EDA['host_response_rate']= _DF_Listing_EDA['host_response_rate'].str.replace('na', '0')   
_DF_Listing_EDA['host_response_rate']= _DF_Listing_EDA['host_response_rate'].str.replace('percent', '')
_DF_Listing_EDA['host_response_rate']=_DF_Listing_EDA['host_response_rate'].astype('float')


In [None]:
# Encode host_is_superhost as 1 where the value equals 't', 0 for anything else (including missing).
_DF_Listing_EDA['host_is_superhost'] = (_DF_Listing_EDA['host_is_superhost'] == 't').astype(int)

In [None]:
# 1 when the host has a non-empty picture URL, else 0.
# Missing URLs are treated as empty text first: len() on a NaN (a float) raises TypeError.
_DF_Listing_EDA['Has_Profile_Pic'] = (
    _DF_Listing_EDA['host_picture_url'].fillna('').astype(str).str.len().gt(0).astype(int)
)

In [None]:
# Encode host_identity_verified as 1 where the value equals 't', 0 for anything else (including missing).
_DF_Listing_EDA['host_identity_verified'] = (_DF_Listing_EDA['host_identity_verified'] == 't').astype(int)

In [None]:
_DF_Listing_EDA['Has_Profile_Pic']

In [None]:
# Drop the raw URL column and renumber the rows 0..n-1.
_DF_Listing_EDA = _DF_Listing_EDA.drop('host_picture_url', axis=1).reset_index(drop=True)

In [None]:
def _count_verifications(raw):
    """Return the number of entries in a stringified list of verification types.

    Counting commas (the previous approach) reports one entry too few for every
    non-empty list and fails on missing values. Here the surrounding brackets are
    stripped, the text is split on commas, and only non-empty entries are counted.
    Non-string input (e.g. NaN) counts as 0.

    NOTE(review): assumes values look like "['email', 'phone']" - confirm against the data.
    """
    if not isinstance(raw, str):
        return 0
    entries = (item.strip(" '\"") for item in raw.strip().strip('[]').split(','))
    return sum(1 for item in entries if item)


_DF_Listing_EDA['host_verifications_types'] = _DF_Listing_EDA['host_verifications'].apply(_count_verifications)

In [None]:
_DF_Listing_EDA['host_verifications_types']

In [None]:
# Drop the raw verification list column and renumber the rows 0..n-1.
_DF_Listing_EDA = _DF_Listing_EDA.drop('host_verifications', axis=1).reset_index(drop=True)

In [None]:
_DF_Listing_EDA["host_Since"] = pd.to_datetime(_DF_Listing_EDA["host_since"])

In [None]:
_DF_Listing_EDA["host_Since"]

In [None]:
#https://stackoverflow.com/questions/57011334/how-to-find-number-of-days-between-today-and-future-date/57013179
_DF_Listing_EDA['host_age'] = ( pd.Timestamp('now')-_DF_Listing_EDA['host_Since']).dt.days

In [None]:
_DF_Listing_EDA['host_age']

In [None]:
# Drop the temporary parsed-date column 'host_Since' and renumber the rows 0..n-1.
_DF_Listing_EDA = _DF_Listing_EDA.drop('host_Since', axis=1).reset_index(drop=True)

In [None]:
# Working frame for the host-level analysis.
# An explicit copy is taken because this frame is modified afterwards (columns are rescaled
# and a 'neighbourhood' column is added); writing into a plain column selection triggers
# SettingWithCopyWarning and may not stick.
df_list_host = _DF_Listing_EDA[['host_id','host_name','host_about_score','host_about_len',
                 'host_neighbourhoodd','host_location',
                 'host_response_time','host_response_rate','host_acceptance_rate','host_is_superhost',
                 'host_total_listings_count',
                 'Has_Profile_Pic','host_has_profile_pic','host_verifications_types','host_age','price','id']].copy()


In [None]:
df_list_host.info()

In [None]:
# Replace the numeric host columns and price with their min-max scaled values,
# using the notebook-level `scaler` (a MinMaxScaler instance); the scaler is re-fitted here.
df_list_host[['host_about_score','host_about_len','host_response_rate','host_acceptance_rate','host_total_listings_count','host_verifications_types','host_age','price']] = scaler.fit_transform(df_list_host[['host_about_score','host_about_len','host_response_rate','host_acceptance_rate','host_total_listings_count','host_verifications_types','host_age','price']])

In [None]:
df_list_host.groupby('host_neighbourhoodd')['price'].mean().iplot(
    kind='bar',
    xTitle='Host Neighbourhood',
    linecolor='black',
    yTitle='Avg Price',
    title='Host Neighbour Hood Vs Price')

In [None]:
df_list_host.groupby('host_location')['price'].mean().iplot(
    kind='bar',
    xTitle='Host Location',
    linecolor='black',
    yTitle='Price',
    title='Host Location vs Price')

In [None]:
df_list_host.groupby('host_name')['host_about_score','price'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_about_score']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby('host_name')['host_about_len','price'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_about_len']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby('host_name')['host_about_score','host_about_len','price'].mean().iplot(
    )

In [None]:
df_list_host.groupby('host_name')['price','host_response_rate'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_response_rate']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby('host_name')['price','host_acceptance_rate'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_acceptance_rate']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:

df_list_host['neighbourhood']= _DF_Listing_EDA['neighbourhood']

In [None]:
df_list_host.groupby('host_name')['price','host_total_listings_count'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_total_listings_count']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby('host_name')['price','host_age'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_age']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby('host_name')['price','host_verifications_types'].mean().iplot(
    )

In [None]:
df_list_host[['price','host_verifications_types']].iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby(['host_is_superhost','neighbourhood'])['price'].mean().iplot()

In [None]:
df_list_host.groupby(['host_is_superhost','neighbourhood'])['price'].mean().iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby(['host_has_profile_pic','neighbourhood'])['price'].mean().iplot()

In [None]:
df_list_host.groupby(['host_has_profile_pic','neighbourhood'])['price'].mean().iplot(kind = 'scatter' , mode = 'markers')

In [None]:
df_list_host.groupby(['Has_Profile_Pic','neighbourhood'])['price'].mean().iplot()

In [None]:
df_list_host.groupby(['Has_Profile_Pic','neighbourhood'])['price'].mean().iplot(kind = 'scatter' , mode = 'markers')

In [None]:
# Correlation heatmap over the host features and price.
# Only numeric/bool columns are correlated: DataFrame.corr() raises on text columns on pandas >= 2.0.
# NOTE(review): 'host_has_profile_pic' is not converted to a number in the visible cells, so it is
# presumably still text and will be left out of the matrix - confirm, or encode it like host_is_superhost.
df_list_host[['host_about_score','host_about_len','host_response_rate','host_acceptance_rate','host_total_listings_count','host_verifications_types','host_age','Has_Profile_Pic','host_has_profile_pic','host_is_superhost','price']].select_dtypes(include=['number', 'bool']).corr().iplot(kind='heatmap',colorscale="Blues",title="Feature Correlation Matrix")

In [None]:
del df_list_host

In [None]:
_DF_Listing_EDA.neighbourhood_cleansed

In [None]:
#Text Preprocessing for Room Types
_DF_Listing_EDA.room_type.unique()

In [None]:

# Build 'clean_room_type': replaceSpaces() applied to each room_type, then lower-cased and trimmed.
from tqdm import tqdm

# Iterate the column itself instead of DataFrame.iterrows(), which materialises a
# Series for every row only to read a single field from it.
preprocessed_room_type = [
    replaceSpaces(raw_room_type).lower().strip()
    for raw_room_type in tqdm(_DF_Listing_EDA['room_type'])
]

_DF_Listing_EDA['clean_room_type'] = preprocessed_room_type
# The raw column is no longer needed once the cleaned one exists.
_DF_Listing_EDA.drop(['room_type'], axis=1, inplace=True)


In [None]:
_DF_Listing_EDA.clean_neighbourhood.unique()

In [None]:
_DF_Listing_EDA.clean_room_type.unique()

In [None]:
_DF_Listing_EDA.amenities.unique()

In [None]:
amenities_values = list(_DF_Listing_EDA['amenities'].values)
# remove special characters from list of strings python: https://stackoverflow.com/a/47301924/4084039

# https://www.geeksforgeeks.org/removing-stop-words-nltk-python/
# https://stackoverflow.com/questions/23669024/how-to-strip-a-specific-word-from-a-string
# https://stackoverflow.com/questions/8270092/remove-all-whitespace-in-a-string-in-python


def _clean_amenities(raw):
    """Turn one comma-separated amenities string into space-separated tokens.

    Each comma-separated part has its spaces removed and is stripped; the parts are joined
    with single spaces, '&' is replaced by '_', and the result is stripped.
    A missing value (anything whose str() is 'nan') yields an empty string.
    """
    if str(raw) == 'nan':
        return ""
    tokens = [part.replace(' ', '').strip() for part in raw.split(',')]
    return " ".join(tokens).replace('&', '_').strip()


# One output entry per input row. Previously missing values were skipped entirely, which
# made the list shorter than the frame and broke the column assignment below.
amenities_list = [_clean_amenities(raw_amenities) for raw_amenities in amenities_values]

_DF_Listing_EDA['clean_amenities'] = amenities_list
#_DF_Listing_EDA.drop(['amenities'], axis=1, inplace=True)


In [None]:

# count of all the words in corpus python: https://stackoverflow.com/a/22898595/4084039
# Tally every whitespace-separated token across all clean_amenities strings.
my_counter = Counter(
    token
    for amenities_text in _DF_Listing_EDA['clean_amenities'].values
    for token in amenities_text.split()
)

amenities_dict = dict(my_counter)
# Same mapping, re-ordered by ascending count.
sorted_amenities_dict = dict(sorted(amenities_dict.items(), key=lambda pair: pair[1]))

In [None]:
amenities_values

In [None]:
sorted_amenities_dict

In [None]:
amenities_dict

In [None]:
_DF_Listing_EDA.clean_amenities.unique()

In [None]:
_DF_Listing_EDA['neighborhood_overview']

In [None]:
_DF_Listing_EDA['property_type'] = _DF_Listing_EDA["property_type"].fillna("").apply(preprocess)
_DF_Listing_EDA.property_type

In [None]:
_DF_Listing_EDA.head()

# Will Do Analysis of Price with Each Feature

In [None]:
_DF_Listing_EDA.hist(bins=50, figsize=(30,20));

## 1. Room Type  AND PROPERTY TYPE Analysis for Price

In [None]:
_DF_Listing_EDA.property_type.unique()

In [None]:
_DF_Listing_EDA.clean_room_type.unique()

In [None]:
from matplotlib.pyplot import hist

hist(_DF_Listing_EDA.clean_room_type, weights=_DF_Listing_EDA.price)

In [None]:
# Number of listings per room type.
# The figure is sized before drawing, and the column is passed by keyword: newer seaborn
# releases treat the first positional argument as `data`, not `x`.
fig = plt.figure(figsize=(10, 10))
sns.countplot(x='clean_room_type', data=_DF_Listing_EDA, palette="plasma")
plt.title('Room Type')

In [None]:
df_room_groups= _DF_Listing_EDA.groupby('clean_room_type')

In [None]:
# Mean / max / min / median price for every room-type group.
df_prices_by_room = df_room_groups.agg(
    mean_Price=('price', 'mean'),
    max_price=('price', 'max'),
    min_price=('price', 'min'),
    medain_price=('price', 'median'),
)

In [None]:
df_prices_by_room

In [None]:
df_prices_by_room.plot(kind="bar",figsize = (12,7))

In [None]:
sns.boxplot(x= _DF_Listing_EDA['price'])

In [None]:
from matplotlib import pyplot
title = 'Avg Price per Room Type'
# Room types ordered by ascending mean price; this fixes the bar order below.
result = (
    _DF_Listing_EDA
    .groupby(["clean_room_type"])['price']
    .aggregate(np.mean)
    .reset_index()
    .sort_values('price')
)

a4_dims = (11, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
sns.barplot(
    data=_DF_Listing_EDA,
    x='clean_room_type',
    y="price",
    order=result['clean_room_type'],
    ax=ax,
)
plt.title(title)
plt.ioff()

In [None]:
_DF_Listing_EDA[_DF_Listing_EDA.price>=2000].shape

In [None]:
_DF_Listing_EDA[_DF_Listing_EDA.price>=1350].shape

In [None]:
_DF_Listing_EDA[_DF_Listing_EDA.price>=239].shape

In [None]:
import matplotlib.pyplot as plt
plt.figure(figsize=(15, 6))
sns.boxplot(x="clean_room_type", y="price", data= _DF_Listing_EDA)

In [None]:
propertytype_DF = _DF_Listing_EDA.groupby('property_type').id.count()

In [None]:
propertytype_DF.plot(kind = 'barh' , figsize = (12,7))

In [None]:
# Property types ordered by descending median price; used as the box order.
sort_price = (
    _DF_Listing_EDA
    .groupby('property_type')['price']
    .median()
    .sort_values(ascending=False)
    .index
)
# The figure must exist before drawing: calling plt.figure() after the boxplot opened
# a second, empty 100x50-inch figure and left the actual plot at the default size.
plt.figure(figsize=(15, 6))
sns.boxplot(y='price', x='property_type', data=_DF_Listing_EDA, order=sort_price)
ax = plt.gca()
ax.set_xticklabels(ax.get_xticklabels(), rotation=45, ha='right')
plt.show();

In [None]:
plt.figure(figsize=(15, 6))
sns.boxplot(x="property_type", y="price", data= _DF_Listing_EDA)

In [None]:
roomProperty_DF = _DF_Listing_EDA.groupby(['property_type','clean_room_type']).price.mean()

In [None]:
roomProperty_DF

In [None]:
roomProperty_DF.plot(kind='barh',figsize = (12,7))

<b>Observation : (Room Type)</b> <br>
1. There are four types of room available. <br>
2. Maximum listings are available for entire_home/apt. <br>
3. Listings with private_room are almost half of the entire_home/apt listings. <br>
4. Listings with hotel room and shared room are very few and almost similar in count. <br>
5. The max price of USD 2000, for shared room and entire home, looks like an outlier. <br>
6. Avg price for entire home is 175 whereas max is 2000. <br>
7. Avg price for hotel room is 89; max is 239. <br>
8. Avg price for private room is 68 and max is 1350. 1350 also looks like an outlier. <br>
9. Shared rooms are more costly


# 2 .Neighbour Hood Analysis for Price

In [None]:
_DF_Listing_EDA.clean_neighbourhood.unique()

In [None]:
nType_DF=_DF_Listing_EDA.groupby('clean_neighbourhood').id.count()
nType_DF.plot(kind = 'barh' , figsize = (12,7))

In [None]:
# Mean / max / min / median price for every cleaned neighbourhood.
df_prices_by_neighbourhood = _DF_Listing_EDA.groupby('clean_neighbourhood').agg(
    mean_Price=('price', 'mean'),
    max_price=('price', 'max'),
    min_price=('price', 'min'),
    medain_price=('price', 'median'),
)

In [None]:
df_prices_by_neighbourhood.plot(kind = 'barh' , figsize = (12,7))

In [None]:
#Neighbour Hood Based Overview and Count

'''
_DF_Listing['avg_review_score'] =avg_review_score
_DF_Listing['total_reviews_count']=total_reviews
_DF_Listing['avg_review_len'] =avg_review_len
_DF_Listing['past_review_date'] =previous_reviewDate
_DF_Listing['laste_review_date'] =latest_reviewDate
'''

In [None]:
_DF_Listing_EDA.columns

In [None]:
df_neighbouthood_overviews= _DF_Listing_EDA.groupby('clean_neighbourhood')['neighborhood_overview_score'].mean()

In [None]:
df_neighbouthood_overviews.plot(kind = 'barh' , figsize = (12,7))

In [None]:
#df_index = list(_DF_Listing_EDA['clean_neighbourhood'].unique())
# Mean overview score and mean price per cleaned neighbourhood, as a flat frame.
# Columns are selected with a list: tuple-style selection on a groupby was removed in pandas 2.0.
grouped_df = _DF_Listing_EDA.groupby(
'clean_neighbourhood')[['neighborhood_overview_score','price']].mean().reset_index()


In [None]:
grouped_df

In [None]:
import matplotlib.pyplot as plt
plt.scatter(_DF_Listing_EDA['neighborhood_overview_score'], _DF_Listing_EDA['price'])
plt.show() # Depending on whether you use IPython or interactive mode, etc.

In [None]:
plt.figure(figsize=(30, 15))
sns.boxplot(x="clean_neighbourhood", y="reviews_per_month", data= _DF_Listing_Filtered)

In [None]:
plt.figure(figsize=(30, 15))
sns.boxplot(x="clean_neighbourhood", y="calculated_host_listings_count", data= _DF_Listing_Filtered)

<b>Observation: </b> <br>

1. Maximum listings are in the neighbourhood of Jersey City, New Jersey
2. Listings around New York, UnitedStates are very few
3. The second-highest count of listings has no neighbourhood mentioned.
4. Maximum average prices are near NY and NJ
5. Maximum reviews are found in Jersey City, New Jersey
6. Hoboken NJ is the most costly neighbourhood
7. NorthBergen NJ has much lower prices than other areas

    

# Analysis Based on Hosts

# Top Hosts with maximum listings 

In [None]:
max_host_20=_DF_Listing_Filtered.host_id.value_counts().head(20)
max_host_20

In [None]:
#Maximum listings hosted by a single host are 117.
#Lets Verify them with  host listing count columns

In [None]:
fig, ax = plt.subplots()
_DF_Listing_Filtered.host_id.value_counts()[:30].plot(kind='bar',x='Host ID', y='Count',ax=ax, legend=True)

In [None]:
# Two-column frame (Host_ID, P_Count) built from the value_counts Series.
# The index and the values are named explicitly instead of renaming whatever reset_index()
# infers: those inferred names changed in pandas 2.0, where the old rename mapping would
# turn the id column into 'P_Count' and leave no 'Host_ID' column at all.
top_host_df = max_host_20.rename_axis('Host_ID').reset_index(name='P_Count')
top_host_df

In [None]:
# Bar chart of listing counts for the hosts in top_host_df.
viz_1 = sns.barplot(x="Host_ID", y="P_Count", data=top_host_df,
                    palette='Blues_d')
# The previous title said "in NYC", but this notebook's data location is Jersey_NJ,
# so the location is left out of the title.
viz_1.set_title('Hosts with the most listings')
viz_1.set_ylabel('Count of listings')
viz_1.set_xlabel('Host IDs')
viz_1.set_xticklabels(viz_1.get_xticklabels(), rotation=45)

<b>After the first 10 hosts, the rest have almost the same distribution.</b>

In [None]:
df_groupBy_Host=_DF_Listing_Filtered.groupby('host_id')

In [None]:
df_groupBy_Host.agg(mean_Price=('price',np.mean),max_price=('price',np.max)
                                                   ,min_price=
                                     ('price',np.min),medain_price=('price',np.median))

## Top Hosts based on Maximum Reviews Count

In [None]:
df_host_max_reviews = _DF_Listing_Filtered.groupby('host_id').agg({'number_of_reviews': 'sum'})

In [None]:
df_host_max_reviews.sort_values('number_of_reviews', ascending=False).head(10)

In [None]:
# Correlation heatmap over the numeric (and boolean) columns of the filtered listings.
plt.figure(figsize=(20,10))
title = 'Correlation matrix of numerical variables'
# Text columns are excluded explicitly: DataFrame.corr() raises on them on pandas >= 2.0
# instead of dropping them silently.
numeric_listing_columns = _DF_Listing_Filtered.select_dtypes(include=['number', 'bool'])
sns.heatmap(numeric_listing_columns.corr(), square=True, cmap='RdYlGn')
plt.title(title)
plt.ioff()

In [None]:
# See https://www.kaggle.com/biphili/hospitality-in-era-of-airbnb
# Two longitude/latitude scatter plots of the listings: coloured by neighbourhood, then by room type.
# x/y/hue are passed by keyword together with data=; positional x and y are rejected by seaborn >= 0.12.
title = 'Neighbourhood Group Location'
plt.figure(figsize=(10,6))
sns.scatterplot(data=_DF_Listing_Filtered, x='longitude', y='latitude',
                hue='clean_neighbourhood').set_title(title)
plt.ioff()

title = 'Room type location per Neighbourhood Group'
plt.figure(figsize=(10,6))
sns.scatterplot(data=_DF_Listing_Filtered, x='longitude', y='latitude',
                hue='clean_room_type').set_title(title)
plt.ioff()

In [None]:
title = 'Room type location per Neighbourhood Group'
sns.catplot(x='clean_room_type', kind="count", hue="clean_neighbourhood", data=_DF_Listing_Filtered);
plt.title(title)
plt.ioff()

# Analysis Based On Min and Max Nights Spent

In [None]:
_DF_Listing_Filtered.describe()['minimum_nights']

In [None]:
_DF_Listing_Filtered.describe()['maximum_nights']

In [None]:
# Summary statistics of 'minimum_nights' per group of df_neighbouthood_groups.
# NOTE(review): df_neighbouthood_groups is not defined in this part of the notebook -
#   presumably a groupby on the neighbourhood column; confirm it exists before this cell.
# NOTE(review): 'min_reviews' actually holds the minimum of minimum_nights, and
#   'count_nights' is computed with np.sum (a total, not a count).
df_reviews_by_neighbourhood_nights_min=df_neighbouthood_groups.agg(mean_nights=('minimum_nights',np.mean),max_nights=('minimum_nights',np.max)
                                                   ,min_reviews=
                                     ('minimum_nights',np.min),medain_nights=('minimum_nights',np.median),count_nights=('minimum_nights',np.sum))

In [None]:
df_reviews_by_neighbourhood_nights_min

In [None]:
# Summary statistics of 'maximum_nights' per group of df_neighbouthood_groups.
# NOTE(review): df_neighbouthood_groups is not defined in this part of the notebook -
#   presumably a groupby on the neighbourhood column; confirm it exists before this cell.
# NOTE(review): 'min_reviews' actually holds the minimum of maximum_nights, and
#   'count_nights' is computed with np.sum (a total, not a count).
f_reviews_by_neighbourhood_nights_max=df_neighbouthood_groups.agg(mean_nights=('maximum_nights',np.mean),max_nights=('maximum_nights',np.max)
                                                   ,min_reviews=
                                     ('maximum_nights',np.min),medain_nights=('maximum_nights',np.median),count_nights=('maximum_nights',np.sum))

In [None]:
f_reviews_by_neighbourhood_nights_max

# Get Sentiments Neighbour Hood Comments

In [None]:
# Download the VADER lexicon, create the analyzer `sid`, and print the polarity scores
# of one fixed sample paragraph.
import nltk
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# import nltk
# nltk.download('vader_lexicon')

sid = SentimentIntensityAnalyzer()

# Sample text, written as one string literal with backslash line continuations.
# NOTE(review): where a line ends in a backslash with no space before it, the last word is
# fused with the first word of the next line (e.g. "range" + "of").
for_sentiment = 'a person is a person no matter how small dr seuss i teach the smallest students with the biggest enthusiasm \
for learning my students learn in many different ways using all of our senses and multiple intelligences i use a wide range\
of techniques to help all my students succeed students in my class come from a variety of different backgrounds which makes\
for wonderful sharing of experiences and cultures including native americans our school is a caring community of successful \
learners which can be seen through collaborative student project based learning in and out of the classroom kindergarteners \
in my class love to work with hands on materials and have many different opportunities to practice a skill before it is\
mastered having the social skills to work cooperatively with friends is a crucial aspect of the kindergarten curriculum\
montana is the perfect place to learn about agriculture and nutrition my students love to role play in our pretend kitchen\
in the early childhood classroom i have had several kids ask me can we try cooking with real food i will take their idea \
and create common core cooking lessons where we learn important math and writing concepts while cooking delicious healthy \
food for snack time my students will have a grounded appreciation for the work that went into making the food and knowledge \
of where the ingredients came from as well as how it is healthy for their bodies this project would expand our learning of \
nutrition and agricultural cooking recipes by having us peel our own apples to make homemade applesauce make our own bread \
and mix up healthy plants from our classroom garden in the spring we will also create our own cookbooks to be printed and \
shared with families students will gain math and literature skills as well as a life long enjoyment for healthy cooking \
nannan'
# polarity_scores returns a dict; each key/value pair is printed on one line below.
ss = sid.polarity_scores(for_sentiment)

for k in ss:
    print('{0}: {1}, '.format(k, ss[k]), end='')

# we can use these 4 things as features/attributes (neg, neu, pos, compound)
# neg: 0.0, neu: 0.753, pos: 0.247, compound: 0.93

In [None]:
#https://medium.com/analytics-vidhya/simplifying-social-media-sentiment-analysis-using-vader-in-python-f9e6ec6fc52f
clean_neighborhood_overviews = _DF_Listing_Filtered['clean_neighborhood_overview']
# Score every overview with VADER and keep only the 'compound' value.
clean_neighborhood_overviews_sentiments = [
    sid.polarity_scores(overview_text)['compound']
    for overview_text in tqdm(clean_neighborhood_overviews)
]

_DF_Listing_Filtered['clean_neighborhood_overview_sentiment'] = clean_neighborhood_overviews_sentiments

In [None]:

_DF_Listing_Filtered['clean_neighborhood_overview_sentiment']

In [None]:
_DF_Listing_Filtered.describe()['clean_neighborhood_overview_sentiment']

In [None]:
from matplotlib import pyplot
title = 'Avg Sentiments of Host per Room Type'
# Room types ordered by ascending mean overview sentiment; this fixes the bar order below.
result = (
    _DF_Listing_Filtered
    .groupby(["clean_room_type"])['clean_neighborhood_overview_sentiment']
    .aggregate(np.mean)
    .reset_index()
    .sort_values('clean_neighborhood_overview_sentiment')
)

a4_dims = (11, 8)
fig, ax = pyplot.subplots(figsize=a4_dims)
sns.barplot(
    data=_DF_Listing_Filtered,
    x='clean_room_type',
    y="clean_neighborhood_overview_sentiment",
    order=result['clean_room_type'],
    ax=ax,
)
plt.title(title)
plt.ioff()

In [None]:
from matplotlib import pyplot
title = 'Avg Sentiments of Host per Neighbour Hood '
# Neighbourhoods ordered by ascending mean overview sentiment; this fixes the bar order below.
result = (
    _DF_Listing_Filtered
    .groupby(["clean_neighbourhood"])['clean_neighborhood_overview_sentiment']
    .aggregate(np.mean)
    .reset_index()
    .sort_values('clean_neighborhood_overview_sentiment')
)

a4_dims = (25, 15)
fig, ax = pyplot.subplots(figsize=a4_dims)
sns.barplot(
    data=_DF_Listing_Filtered,
    x='clean_neighbourhood',
    y="clean_neighborhood_overview_sentiment",
    order=result['clean_neighbourhood'],
    ax=ax,
)
plt.title(title)
plt.ioff()

In [None]:
df_nn_max_reviews = _DF_Listing_Filtered.groupby('clean_neighbourhood').agg({'clean_neighborhood_overview_sentiment': 'mean'})

In [None]:
df_nn_max_reviews

In [None]:
_DF_Listing_EDA.info(verbose = True)