In [15]:
import pandas as pd
import numpy as np
import math

import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.decomposition import SparsePCA, NMF
from sklearn.metrics import mean_squared_error #, explained_variance_score

from scipy.sparse import csr_matrix

## Apply TF-IDF to count data

This sort of sparse matrix of frequencies between 0 and 1 closely resembles the frequency 
of terms in a document. 

It may be beneficial to apply the inverse log factor used by TF-IDF to 
give less weight to highly common venue categories.
We can think of this as "venue frequency - inverse city frequency" (VF-ICF, if you will!)

In [6]:
# Load the per-city venue counts, index the rows by city name, and replace
# missing values with a count of zero.
venue_counts = (
    pd.read_csv('data/counted_world_venues.csv', index_col=0)
    .set_index('City', drop=True)
    .fillna(0)
)
venue_counts

Unnamed: 0_level_0,"Богданов и партнеры""""""","""""dakineshop.ru""""""","117""""""","12""""""","13""""""","17""""""","18""""""","18.""","34""""""","38""""""",...,Zhejiang Restaurant,Zoo,Zoo Exhibit,"quando e dove vuoi tu """"""",Çöp Şiş Place,"Корпорация СБР""","д.24""""""","не Дай Другим""""""","площадка #2""","студия звукозаписи."""
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bangalore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Tangerang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Casablanca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Cairo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Phoenix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Munich,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Bogotá,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Makasar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Izmir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,6.0,9.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0


In [12]:
# Number of cities that have at least one venue in the 'Zoo' category.
int((venue_counts['Zoo'] > 0).sum())

111

In [8]:
# Load the per-city venue frequencies; the first CSV column becomes the row index.
venue_frequency = pd.read_csv(
    'data/world_venues_frequency.csv',
    index_col=0,
)
venue_frequency

Unnamed: 0_level_0,"Богданов и партнеры""""""","""""dakineshop.ru""""""","117""""""","12""""""","13""""""","17""""""","18""""""","18.""","34""""""","38""""""",...,Zhejiang Restaurant,Zoo,Zoo Exhibit,"quando e dove vuoi tu """"""",Çöp Şiş Place,"Корпорация СБР""","д.24""""""","не Дай Другим""""""","площадка #2""","студия звукозаписи."""
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bangalore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000071,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Tangerang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000256,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Casablanca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000322,0.000322,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Cairo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Phoenix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000539,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Munich,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Bogotá,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000144,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Makasar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Izmir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0000,0.000238,0.000357,0.0,0.000079,0.0,0.0,0.0,0.0,0.0


In [18]:
# ICF(category) = ln(total number of cities / number of cities with that category)
# This is the city-level analogue of TF-IDF's inverse document frequency.

num_cities = len(venue_frequency)
# NOTE(review): the total is taken from venue_frequency while the per-category
# counts come from venue_counts; this assumes both frames cover the same
# cities -- confirm.

# Count, in a single vectorised pass, how many cities have at least one venue
# in each category. Filtering the whole frame once per column would copy every
# column on every iteration.
cities_per_category = (venue_counts > 0).sum(axis=0)

# A category that appears in no city would otherwise divide by zero. It is
# given a weight of 0.0; presumably its frequencies are all zero as well, so
# the weighted result is unaffected -- verify against the frequency data.
inv_city_frequency_dict = {
    category: math.log(num_cities / int(n_cities)) if n_cities > 0 else 0.0
    for category, n_cities in cities_per_category.items()
}

inv_city_frequency_dict

{'  Богданов и партнеры"""': 5.521460917862246,
 ' ""dakineshop.ru"""': 5.521460917862246,
 ' 117"""': 5.521460917862246,
 ' 12"""': 5.521460917862246,
 ' 13"""': 5.521460917862246,
 ' 17"""': 5.521460917862246,
 ' 18"""': 5.521460917862246,
 ' 18."': 4.8283137373023015,
 ' 34"""': 5.521460917862246,
 ' 38"""': 5.521460917862246,
 ' 43"""': 5.521460917862246,
 ' 46"""': 4.8283137373023015,
 ' 8"""': 5.521460917862246,
 ' 9"""': 5.521460917862246,
 ' Balanceo y Suspenciones"': 5.521460917862246,
 ' C.U. (Escuelita)"': 5.521460917862246,
 ' Cabelo e Boteco"""': 5.521460917862246,
 ' Campesinos"': 5.521460917862246,
 ' Dani e Sr. Santos"""': 5.521460917862246,
 ' Drinks y Snacks"': 5.521460917862246,
 ' Exposición Temporal"': 5.521460917862246,
 ' Fradia"': 5.521460917862246,
 ' Guangzhou"': 5.521460917862246,
 ' Hyundai': 5.521460917862246,
 ' ILP KS Tubun"': 5.521460917862246,
 ' IPN"': 5.521460917862246,
 ' Iztapalapa"': 5.521460917862246,
 ' Jakarta Pusat"': 5.521460917862246,
 ' Jaru

In [28]:
# Work on an independent copy so the weighting below leaves venue_frequency untouched.
vficf = venue_frequency.copy(deep=True)

In [30]:
# Scale each category column by its inverse-city-frequency weight.
for category, frequencies in venue_frequency.items():
    vficf[category] = frequencies * inv_city_frequency_dict[category]

vficf

Unnamed: 0_level_0,"Богданов и партнеры""""""","""""dakineshop.ru""""""","117""""""","12""""""","13""""""","17""""""","18""""""","18.""","34""""""","38""""""",...,Zhejiang Restaurant,Zoo,Zoo Exhibit,"quando e dove vuoi tu """"""",Çöp Şiş Place,"Корпорация СБР""","д.24""""""","не Дай Другим""""""","площадка #2""","студия звукозаписи."""
City,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Bangalore,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000066,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Tangerang,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000240,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Casablanca,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000261,0.000302,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Cairo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Phoenix,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000504,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Munich,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Bogotá,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000134,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Makasar,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000000,0.000000,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
Izmir,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000000,0.000193,0.000335,0.0,0.000351,0.0,0.0,0.0,0.0,0.0


#### After running VF-ICF, we can see that zoos must be popular and span many cities, since their frequencies were down-weighted

In [31]:
# Persist the VF-ICF weighted frequencies to CSV.
vficf.to_csv(path_or_buf='data/vficf.csv')