## Strategic Location using Facebook Analytics

If you were to open your own cafe, would you not want to effortlessly identify the most suitable location to set up your shop? Choosing an optimal physical location is a critical decision for numerous businesses, as many factors contribute to the final choice of the location.
Features selected:
* category of cafe
* category of neighboring cafes
* checkins of localities

In [1]:
#preprocessing
import pandas as pd
import numpy as np

def JsontoDataFrame(jsonFile):
    data=pd.read_json(jsonFile)
    data=data.as_matrix()
    data= data.transpose()
    data=pd.DataFrame(data,columns=['Category','Check-Ins','Latitude','Likes','Longitude','Name'])
    return data

def filterFoodRelated(data):
    totalIndex=0
    remIndex=[]
    keepIndex=[]
    uniqueCategories=[]
    frConstraints=['Diner','Fruit','Vegetable','Caterer','Bar','Pub','Grill','Hotel','Restaurant','Lounge','Pizza Place','Dessert Shop','Coffee','Food','Beverage','Cafe']
    
    for index,rows in data.iterrows():
        #print index,rows['Category']
        for cat in rows['Category']:
            categorysublist=cat.split(' ')
            if len(set(categorysublist).intersection(frConstraints))>0:
                keepIndex.append(index)
                uniqueCategories.append(cat)
                break
            
        totalIndex=index
    
    remIndex=list(set(range(totalIndex+1))-set(keepIndex))
    data=data.drop(data.index[remIndex]).reset_index(drop=True)
    return data,list(set(uniqueCategories))
    
all_data=JsontoDataFrame('fbData.json')
food_data,unique_categories=filterFoodRelated(all_data)

In [2]:
#Label Binarizer Encoding Categories
from sklearn import preprocessing
lb=preprocessing.LabelBinarizer()
lb.fit(unique_categories)

def orCategories(bcat1,bcat2):
    """
    Oring Category Codes 
    """
    result_cat=[]
    for val1,val2 in zip(bcat1,bcat2):
        if val1==1 or val2==1:
            result_cat.append(1)
        else:
            result_cat.append(0)
    #print 'res',result_cat
    return result_cat

def addBinaryCategoryData(data):
    """
    Finding Binary Category Codes of all entries
    """
    binary_category=[]
    for catlist in data["Category"]:
        final_category=[0]*len(lb.classes_)
        for cat in catlist:
            if cat in unique_categories:
                #print cat
                binary_encoded_cat=lb.transform([str(cat)])[0]
                finalcat=orCategories(final_category,binary_encoded_cat)
                binary_category.append(finalcat)
    return binary_category

food_data['Category Code']=pd.Series(addBinaryCategoryData(food_data))



In [3]:
#Finding Neigbouring Food related Joints
from math import radians, cos, sin, asin, sqrt

def haversine_dist(lon1, lat1, lon2, lat2):
    """
    Calculate the great circle distance between two points 
    on the earth (specified in decimal degrees)
    """
    # convert decimal degrees to radians 
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])
    # haversine formula 
    dlon = lon2 - lon1 
    dlat = lat2 - lat1 
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    c = 2 * asin(sqrt(a)) 
    m = 6367 * c*1000
    return m


def findNeighbors(data,target_latitude,target_longitude,distance,is_food_related=True):
    """
    Finding Neighbors to a given point on map
    """
    neighbor_name=[]
    neighbor_category=[0]*len(lb.classes_)
    neighbor_checkins=0
    if is_food_related:
        for latitude,longitude,name,binary_category,check_in in zip(data['Latitude'],data['Longitude'],data['Name'],data['Category Code'],data['Check-Ins']):
            neighbor_latitide=latitude
            neighbor_longitude=longitude
            if haversine_dist(target_longitude,target_latitude,neighbor_longitude,neighbor_latitide) < distance and target_longitude!=neighbor_longitude and neighbor_latitide!=target_latitude:
                neighbor_name.append(name)
                if is_food_related:
                    neighbor_category=orCategories(neighbor_category,binary_category)
                neighbor_checkins=neighbor_checkins+check_in
    else:
        for latitude,longitude,name,check_in in zip(data['Latitude'],data['Longitude'],data['Name'],data['Check-Ins']):
            neighbor_latitide=latitude
            neighbor_longitude=longitude
            if haversine_dist(target_longitude,target_latitude,neighbor_longitude,neighbor_latitide) < distance and target_longitude!=neighbor_longitude and neighbor_latitide!=target_latitude:
                neighbor_name.append(name)
                neighbor_checkins=neighbor_checkins+check_in
    return neighbor_name,neighbor_category,neighbor_checkins
         
def findAllNeighbors(distance,is_food_related=True,*data):
    DISTANCE=distance
    all_neighbour_name=[]
    all_neighbour_category=[]
    all_neighbour_checkins=[]
    if is_food_related:
        for lat,lon in zip(data[0]['Latitude'],data[0]['Longitude']):
            neighbor_name,neighbor_category,neighbor_checkins=findNeighbors(data[0],lat,lon,DISTANCE,True)
            all_neighbour_name.append(neighbor_name)
            all_neighbour_category.append(neighbor_category)
            all_neighbour_checkins.append(neighbor_checkins)
    
    else:
        food_related_joints=data[0]['Name']
        for lat,lon,name in zip(data[1]['Latitude'],data[1]['Longitude'],data[1]['Name']):
            #Error in this line
            #print name in food_related_joints returns false
            if name[0] in food_related_joints[0]:
                for lat,lon in zip(data[1]['Latitude'],data[1]['Longitude']):
                    neighbor_name,neighbor_category,neighbor_checkins=findNeighbors(data[1],lat,lon,DISTANCE,False)
                    all_neighbour_name.append(neighbor_name)
                    all_neighbour_checkins.append(neighbor_checkins)  

    return all_neighbour_name,all_neighbour_category,all_neighbour_checkins

    

def findAverageNeighborCheckIns(data,is_food_related=True):
    average_neigbor_checkins=[]
    if is_food_related:
        column_total_check_ins='Food-Related Neighbor Total Check-Ins'
        column_neigbor_names='Food-Related Neighbor Names'
    else:
        column_total_check_ins='All Neighbor Total Check-Ins'
        column_neigbor_names='All Neighbor Names'
    for neigbors,check_ins in zip(data[column_neigbor_names],data[column_total_check_ins]):
        average_neigbor_checkins.append(check_ins/len(neigbors))
    return average_neigbor_checkins
    
    
    



In [4]:
#Food Related DataFrame Modification
all_neighbour_name,all_neighbour_category,all_neighbour_checkins=findAllNeighbors(1000,True,food_data,all_data) 

food_data['Food-Related Neighbor Names']=pd.Series(all_neighbour_name)
food_data['Food-Related Neighbor Category Codes']=pd.Series(all_neighbour_category)
food_data['Food-Related Neighbor Total Check-Ins']=pd.Series(all_neighbour_checkins)
#food_data.drop('Food-Related Neighbor Check-Ins', axis=1, inplace=True)
food_data['Food-Related Neighbor Average Check-Ins']=pd.Series(findAverageNeighborCheckIns(food_data))

In [50]:
#All Neighbors DataFrame Modification
all_neighbour_name,all_neighbour_category,all_neighbour_checkins=findAllNeighbors(1000,False,food_data,all_data) 

food_data['All Neighbor Names']=pd.Series(all_neighbour_name)
food_data['All Neighbor Total Check-Ins']=pd.Series(all_neighbour_checkins)
food_data['All Neighbor Average Check-Ins']=pd.Series(findAverageNeighborCheckIns(food_data,False))
#food_data.drop('All Neighbor Category Codes', axis=1, inplace=True)


In [51]:
#Rearranging Columns
food_data=food_data[['Name','Category','Likes','Check-Ins','Latitude','Longitude','Category Code','Food-Related Neighbor Names','Food-Related Neighbor Category Codes',
                    'Food-Related Neighbor Total Check-Ins','Food-Related Neighbor Average Check-Ins','All Neighbor Names','All Neighbor Total Check-Ins'
                    ,'All Neighbor Average Check-Ins']]
data_of_interest=food_data[['Name','Category','Likes','Check-Ins','Category Code','Food-Related Neighbor Category Codes',
                    'Food-Related Neighbor Total Check-Ins','Food-Related Neighbor Average Check-Ins','All Neighbor Total Check-Ins'
                    ,'All Neighbor Average Check-Ins']]


In [52]:
#Transfering to csv
data_of_interest.to_csv('Final Data.csv',encoding='utf8')

In [53]:
X_train=data_of_interest[['Category Code','Food-Related Neighbor Category Codes','Food-Related Neighbor Total Check-Ins'
                         ,'Food-Related Neighbor Average Check-Ins','All Neighbor Total Check-Ins','All Neighbor Average Check-Ins']]
Y_train=data_of_interest[['Check-Ins']]

import warnings
warnings.filterwarnings('ignore')

def convertBinaryArrayToNumber(array):
    number=0
    array.reverse()
    i=0
    for val in array:
        number=number+val*pow(2,i)
        i=i+1
    return number
        
def convertCodeToNumber(data):
    category_decode=[]
    neighbor_category_decode=[]
    for cat_code,n_cat_code in zip(data['Category Code'],data['Food-Related Neighbor Category Codes']):
        category_decode.append(convertBinaryArrayToNumber(cat_code))
        neighbor_category_decode.append(convertBinaryArrayToNumber(n_cat_code))
        #category_decode.append(lb.inverse_transform(cat_code))
        #neighbor_category_decode.append(ib.inverse_transform(n_cat_code))
        
    return category_decode,neighbor_category_decode
        

category_decode,neighbor_category_decode=convertCodeToNumber(X_train)  
X_train['Category Code']=pd.Series(category_decode)
X_train['Food-Related Neighbor Category Codes']=pd.Series(neighbor_category_decode)

In [54]:
#Converting float to int
#uncomment it once for first run
Y_train.to_csv('Target.csv')
Y_train=pd.read_csv('Target.csv',dtype='float64')
Y_train.drop('Unnamed: 0', axis=1, inplace=True)

Y_train=Y_train.values.astype(int)
X_train=X_train.values.astype(int)

In [62]:
#Testing out models
from sklearn.linear_model import LinearRegression

rgr = LinearRegression()
rgr.fit(xtrain,ytrain)
rgr.predict([         7,   43010568,      57641,       8234,      71409,
             11901])[0][0]


9975.2991440954265