### Introduction

Istanbul is a city with a huge population.

In [1]:
import numpy as np # library to handle data in a vectorized manner

import pandas as pd # library for data analsysis
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
pd.options.display.float_format = '{:,.2f}'.format

import json # library to handle JSON files

#!pip install geopy
from geopy.geocoders import Nominatim # convert an address into latitude and longitude values

from bs4 import BeautifulSoup # Library for webscrabing
import requests # library to handle requests
from pandas.io.json import json_normalize # tranform JSON file into a pandas dataframe

# Matplotlib and associated plotting modules
import matplotlib.cm as cm
import matplotlib.colors as colors
import matplotlib.pyplot as plt
import seaborn as sns

sns.set()
# import k-means from clustering stage
from sklearn.cluster import KMeans

import warnings  
warnings.filterwarnings("ignore")  # library to ignore warnings
#!pip install folium
import folium # map rendering library

print('Libraries imported.')

Libraries imported.


In [11]:
# Two identical sample sequences (1..5) used for a small DataFrame-building demo.
liste = list(range(1, 6))
liste2 = list(range(1, 6))

In [15]:
# Pair the two lists element-wise into a list of (liste, liste2) tuples.
yeni = [pair for pair in zip(liste, liste2)]

In [17]:
# Show the paired values as a two-column frame.
pd.DataFrame(data=yeni, columns=['liste', 'liste2'])

Unnamed: 0,liste,liste2
0,1,1
1,2,2
2,3,3
3,4,4
4,5,5


### Getting Istanbul Location 

In [2]:
address = 'Istanbul, Turkey'

# Resolve the city name to coordinates through the Nominatim geocoder.
geolocator = Nominatim(user_agent="istanbul")
location = geolocator.geocode(address)
# geocode() returns None when nothing matches; fail with a clear message instead
# of an AttributeError on the attribute reads below.
if location is None:
    raise ValueError('Could not geocode address: {!r}'.format(address))
latitude = location.latitude
longitude = location.longitude
print('The geograpical coordinate of Istanbul are {}, {}.'.format(latitude, longitude))

The geograpical coordinate of Istanbul are 41.0091982, 28.9662187.


### Creating Map of Istanbul

In [3]:
# Base map centred on the geocoded city coordinates, with a single marker on the centre.
map_Istanbul = folium.Map(zoom_start=11, location=[latitude, longitude])
city_marker = folium.Marker(popup='Istanbul', location=[latitude, longitude])
city_marker.add_to(map_Istanbul)
map_Istanbul

### Getting Districts of Istanbul from Wikipedia

In [5]:
url = 'https://en.wikipedia.org/wiki/List_of_districts_of_Istanbul'
# A timeout keeps the notebook from hanging on a stalled connection, and
# raise_for_status() stops on an HTTP error instead of parsing an error page as data.
response = requests.get(url, timeout=30)
response.raise_for_status()
source = response.text

In [6]:
# Parse the downloaded HTML with the lxml parser.
soup = BeautifulSoup(markup=source, features='lxml')


In [7]:
# Collect every <table> element found on the page.
table = soup.find_all(name='table')

In [8]:
# Parse the scraped tables once and reuse the result; the original parsed the
# same HTML twice just to pick the first and the second table.
district_tables = pd.read_html(str(table), flavor='bs4')
df1 = district_tables[0]
df2 = district_tables[1]

## Data Preparation

In [9]:
# Put the two tables side by side; the second table's own 'District' column is
# dropped so the name is not duplicated.
df = pd.concat([df1, df2.drop(columns='District')], axis='columns')

In [10]:
# Preview the first five rows of the combined table.
df.head(n=5)

Unnamed: 0,District,Population 31.12.2021,Area (km²),Density (per km²),Mensual household income TL (USD),Annual household income TL (USD)
0,Adalar,16372,11.05,1482,6.652₺ (918$),"79.821₺ (10,978$)"
1,Arnavutköy,312023,450.35,693,2.030₺ (279$),"24.360₺ (3,350$)"
2,Ataşehir,427217,25.23,16933,6.577₺ (904$),"78.924₺ (10,854$)"
3,Avcılar,457981,42.01,10902,3.662₺ (503$),"43.938₺ (6,064$)"
4,Bağcılar,744351,22.36,33289,3.197₺ (441$),"38.367₺ (5,295$)"


In [11]:
# Print column dtypes and non-null counts to see which columns still need cleaning.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43 entries, 0 to 42
Data columns (total 6 columns):
 #   Column                             Non-Null Count  Dtype 
---  ------                             --------------  ----- 
 0   District                           43 non-null     object
 1   Population 31.12.2021              43 non-null     int64 
 2   Area (km²)                         43 non-null     object
 3   Density (per km²)                  43 non-null     int64 
 4   Mensual household income TL (USD)  40 non-null     object
 5   Annual household income TL (USD)   40 non-null     object
dtypes: int64(2), object(4)
memory usage: 2.1+ KB


In [12]:
# Inspect the last five rows, where the regional summary and total rows sit.
df.tail(n=5)

Unnamed: 0,District,Population 31.12.2021,Area (km²),Density (per km²),Mensual household income TL (USD),Annual household income TL (USD)
38,Zeytinburnu,293839,11.59,25353,3.644₺ (502$),6.036₺ (6.036$)
39,Europe (25 districts),10224323,"3,474.35 [2]",2943,4.308₺ (594$)[4],"51.691₺ (7,134$)[5]"
40,Asia (14 districts),5616577,"1,868.87 [2]",3005,,
41,Urban (36 districts)[a],15514128,"2,576.85 [2]",6021,,
42,TOTAL,15840900,"5,343.22 [2][3]",2965,,


### Dropping Unnecessary Rows

In [13]:
# Keep only the 39 district rows (index 0-38). The last 4 rows (index 39-42) are
# the Europe/Asia/Urban summaries and the TOTAL row, not districts.
# .copy() detaches the result from the original frame so the column assignments
# in the following cells do not operate on a slice.
df = df.iloc[:39].copy()

### Changing the Column Names

In [14]:
# Short, code-friendly replacements for the scraped column headers (same order).
column_names = [
    'District',
    'Population',
    'Area',
    'Density',
    'Mensual_Household_Income',
    'Annual_Income',
]

In [15]:
# Replace the column labels positionally with the short names.
df.columns = pd.Index(column_names)

### Preprocessing Columns and Reformatting

- In this section we strip the currency symbols and the parenthesised USD amounts so that only the Turkish lira (TRY) values remain, and we drop the decimals from Area since they won't affect the rest of the analysis.

#### 'Mensual_Household_Income'

In [16]:
# Keep the lira amount only: take the text before '(' (the USD part), trim spaces,
# then remove the currency sign and the '.' thousands separators.
df['Mensual_Household_Income'] = (
    df['Mensual_Household_Income']
    .astype(str)
    .str.split('(').str[0]
    .str.strip(' ')
    .str.replace('₺', '', regex=False)
    .str.replace('.', '', regex=False)
)

In [17]:
# The cleaned digit strings can now be stored as 64-bit integers.
df['Mensual_Household_Income'] = df['Mensual_Household_Income'].astype(np.int64)

#### Annual Income

In [18]:
# Same clean-up as the monthly income: keep the text before '(' (the USD part),
# trim spaces, then remove the currency sign and the '.' thousands separators.
df['Annual_Income'] = (
    df['Annual_Income']
    .astype(str)
    .str.split('(').str[0]
    .str.strip(' ')
    .str.replace('₺', '', regex=False)
    .str.replace('.', '', regex=False)
)

In [19]:
# The cleaned digit strings can now be stored as 64-bit integers.
df['Annual_Income'] = df['Annual_Income'].astype(np.int64)

#### 'Area'

In [20]:
# Keep only the integer part of the area (text before the decimal point).
# str(x) makes the cell work on values that are already numeric (e.g. when the
# cell is re-executed), matching the income clean-up cells, and removing ','
# handles thousands separators of the '3,474.35' form seen in the scraped table.
df['Area'] = df['Area'].apply(lambda x: str(x).replace(',', '').split('.')[0])

In [21]:
# Store the truncated area values as 64-bit integers.
df['Area'] = df['Area'].astype(np.int64)

#### Let's see our cleaned dataset

In [22]:
# Preview the first five rows after cleaning.
df.head(n=5)

Unnamed: 0,District,Population,Area,Density,Mensual_Household_Income,Annual_Income
0,Adalar,16372,11,1482,6652,79821
1,Arnavutköy,312023,450,693,2030,24360
2,Ataşehir,427217,25,16933,6577,78924
3,Avcılar,457981,42,10902,3662,43938
4,Bağcılar,744351,22,33289,3197,38367


In [55]:
# Save the cleaned dataset to disk, without writing the row index.
df.to_csv(path_or_buf='df.csv', index=False)

In [56]:
# Reload the cleaned dataset from the CSV written above.
df = pd.read_csv(filepath_or_buffer='df.csv')

In [23]:
# Preview the first five rows of the reloaded dataset.
df.head(n=5)

Unnamed: 0,District,Population,Area,Density,Mensual_Household_Income,Annual_Income
0,Adalar,16372,11,1482,6652,79821
1,Arnavutköy,312023,450,693,2030,24360
2,Ataşehir,427217,25,16933,6577,78924
3,Avcılar,457981,42,10902,3662,43938
4,Bağcılar,744351,22,33289,3197,38367


### Getting the Location of Each District

In [24]:
Latitude = []
Longitude = []
# One geocoder client is enough; it does not need to be rebuilt for every district.
geolocator = Nominatim(user_agent="Istanbul")
for i in df['District'].values:
    location = geolocator.geocode(i)
    # geocode() returns None when nothing matches; name the failing district
    # instead of crashing with an AttributeError.
    if location is None:
        raise ValueError('Could not geocode district: {!r}'.format(i))
    # Append directly rather than rebinding `latitude`/`longitude`: those names
    # hold the Istanbul city-centre coordinates used to build the base map.
    Latitude.append(location.latitude)
    Longitude.append(location.longitude)
    # NOTE(review): the query is the bare district name; adding ', Istanbul, Turkey'
    # may be needed if a name resolves outside Istanbul - confirm against results.

In [25]:
# Attach the geocoded coordinates to the frame as two new columns.
df['Latitude'], df['Longitude'] = Latitude, Longitude

In [26]:
# Preview the first five rows, now including Latitude and Longitude.
df.head(n=5)

Unnamed: 0,District,Population,Area,Density,Mensual_Household_Income,Annual_Income,Latitude,Longitude
0,Adalar,16372,11,1482,6652,79821,40.872361,29.130448
1,Arnavutköy,312023,450,693,2030,24360,41.184471,28.741245
2,Ataşehir,427217,25,16933,6577,78924,40.984749,29.10672
3,Avcılar,457981,42,10902,3662,43938,40.980135,28.717547
4,Bağcılar,744351,22,33289,3197,38367,41.033899,28.857898


### Creating a Map Showing All District Locations

In [27]:
# Styling shared by every district marker.
marker_style = dict(
    radius=5,
    color='blue',
    fill=True,
    fill_color='#3186cc',
    fill_opacity=0.7,
    parse_html=False,
)
# Add one circle marker per district, with the district name as its popup.
for lat, lng, label in zip(df['Latitude'], df['Longitude'], df['District']):
    label = folium.Popup(label, parse_html=True)
    district_marker = folium.CircleMarker([lat, lng], popup=label, **marker_style)
    district_marker.add_to(map_Istanbul)
map_Istanbul

## Getting Venues Data Using the Foursquare API

In [34]:
import os

# Credentials are read from the environment so the secret is never stored in the
# notebook. Set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET before running.
CLIENT_ID = os.environ.get('FOURSQUARE_CLIENT_ID', '')  # your Foursquare ID
CLIENT_SECRET = os.environ.get('FOURSQUARE_CLIENT_SECRET', '')  # your Foursquare Secret
VERSION = '20210414' # Foursquare API version

# Plain print rather than warnings.warn: this notebook silences all warnings.
if not (CLIENT_ID and CLIENT_SECRET):
    print('Foursquare credentials are missing: set FOURSQUARE_CLIENT_ID and FOURSQUARE_CLIENT_SECRET.')

In [35]:
# Sample query point (latitude, longitude) for the venue search.
lat, long = 40.991572, 29.027017
# Maximum number of results and search radius passed to the API.
LIMIT, radius = 50, 500

In [36]:
# Build the venue-search request URL from the credentials and the query settings.
url = (
    'https://api.foursquare.com/v2/venues/search'
    '?client_id={}&client_secret={}&ll={},{}&v={}&radius={}&limit={}'
).format(*(CLIENT_ID, CLIENT_SECRET, lat, long, VERSION, radius, LIMIT))

'https://api.foursquare.com/v2/venues/search?client_id=<REDACTED>&client_secret=<REDACTED>&ll=40.991572,29.027017&v=20210414&radius=500&limit=50'

In [37]:
categories_url = 'https://api.foursquare.com/v2/venues/categories?client_id={}&client_secret={}&v={}'.format(
            CLIENT_ID, 
            CLIENT_SECRET, 
            VERSION)

# Make the GET request. The timeout avoids hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors (e.g. bad credentials) here rather than
# as a KeyError when the response is indexed later.
categories_response = requests.get(categories_url, timeout=30)
categories_response.raise_for_status()
results = categories_response.json()

In [38]:
# Filled by print_categories() with (name, id) tuples of every category it prints.
categories_list = []


def print_categories(categories, level=0, max_level=0):
    """Print a category tree down to ``max_level`` and record what was printed.

    Each category is printed as ``name (id)``, prefixed with one '-' per nesting
    level. After a category's children have been processed, its ``(name, id)``
    tuple is appended to the module-level ``categories_list``.

    Parameters:
        categories: iterable of dicts with 'name', 'id' and 'categories' keys.
        level: nesting depth of ``categories`` (0 for the top level).
        max_level: deepest level that is still printed and recorded.
    """
    if level <= max_level:
        prefix = '-' * level
        for category in categories:
            print(''.join([prefix, category['name'], ' (', category['id'], ')']))
            # Children are handled before the parent is recorded.
            print_categories(category['categories'], level + 1, max_level)
            categories_list.append((category['name'], category['id']))
        
# Print and record only the top-level categories (level 0).
print_categories(results['response']['categories'], level=0, max_level=0)

Arts & Entertainment (4d4b7104d754a06370d81259)
College & University (4d4b7105d754a06372d81259)
Event (4d4b7105d754a06373d81259)
Food (4d4b7105d754a06374d81259)
Nightlife Spot (4d4b7105d754a06376d81259)
Outdoors & Recreation (4d4b7105d754a06377d81259)
Professional & Other Places (4d4b7105d754a06375d81259)
Residence (4e67e38e036454776db1fb3a)
Shop & Service (4d4b7105d754a06378d81259)
Travel & Transport (4d4b7105d754a06379d81259)


In [39]:

def get_venues_count(lat, long, radius, categoryId, timeout=30):
    """Return the 'totalResults' value of a Foursquare explore query.

    Parameters:
        lat, long: coordinates of the query point.
        radius: value sent as the ``radius`` query parameter.
        categoryId: Foursquare category id sent as ``categoryId``.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        The ``response.totalResults`` field of the JSON reply.

    Raises:
        requests.HTTPError: if the API answers with an HTTP error status.
    """
    explore_url = 'https://api.foursquare.com/v2/venues/explore?client_id={}&client_secret={}&v={}&ll={},{}&radius={}&categoryId={}'.format(
                CLIENT_ID, 
                CLIENT_SECRET, 
                VERSION,
                lat,
                long,
                radius,
                categoryId)

    # Make the GET request; fail fast on stalled connections and HTTP errors
    # instead of hanging or raising a KeyError on an error payload.
    response = requests.get(explore_url, timeout=timeout)
    response.raise_for_status()
    return response.json()['response']['totalResults']

In [182]:
# New frame for the venue counts: a copy of df plus one zero-filled column per
# category name collected in categories_list.
df_venues = df.assign(**{category_name: 0 for category_name, _ in categories_list})

df_venues.head()

Unnamed: 0,District,Arts & Entertainment,College & University,Event,Food,Nightlife Spot,Outdoors & Recreation,Professional & Other Places,Residence,Shop & Service,Travel & Transport
0,Adalar,0,0,0,0,0,0,0,0,0,0
1,Arnavutköy,0,0,0,0,0,0,0,0,0,0
2,Ataşehir,0,0,0,0,0,0,0,0,0,0
3,Avcılar,0,0,0,0,0,0,0,0,0,0
4,Bağcılar,0,0,0,0,0,0,0,0,0,0


In [41]:
#Request number of venues, store result as CSV
#for i, row in df_venues.iterrows():
#   for c in categories_list:        
#       df_venues.loc[i, c[0]] = get_venues_count(df_venues.Latitude.iloc[i], df_venues.Longitude.iloc[i], radius=1000, categoryId=c[1])
#   print(i)

0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38


In [42]:
#df_venues.to_csv('df_venues2022_1.csv',index=False)