In [331]:
import numpy as np
import pandas as pd
import csv
from sklearn.impute import KNNImputer
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns

In [332]:
# Load the raw startup dataset; the path is relative to the notebook's working directory.

data = pd.read_csv(filepath_or_buffer="startupdata.csv")

In [333]:
#Counting null values in each column.
#print(data.isnull().sum())

In [334]:
# data.info()

In [335]:
#Before filling null values.
#print(data.isnull().sum())

In [336]:
#Filling missing values.
#Now filling the null values.
def imputing_numeric_missing_values(dataset,n_neighbors=10):
    """Fill missing values in the numeric columns of ``dataset`` with KNN imputation.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame to impute. It is modified in place and also returned.
    n_neighbors : int, default 10
        Number of neighbours handed to ``KNNImputer``. (Previously this
        argument was accepted but never forwarded, so the imputer always ran
        with its own default.)

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, with its numeric columns replaced by the
        imputed (float) values. Non-numeric columns are left untouched.
    """
    numerical_column_names = dataset.select_dtypes([np.number]).columns
    if len(numerical_column_names) == 0:
        # Nothing numeric to impute.
        return dataset

    # KNNImputer discards columns that contain no observed value at all, which
    # would make the result narrower than the selection; leave those columns
    # as they are instead of failing on a shape mismatch.
    has_observed = dataset[numerical_column_names].notna().any(axis=0).to_numpy()
    imputable_columns = numerical_column_names[has_observed]
    if len(imputable_columns) == 0:
        return dataset

    knn = KNNImputer(n_neighbors=n_neighbors)
    knn_dataset = knn.fit_transform(dataset[imputable_columns])

    # Rebuild the frame with the caller's index and column labels: column
    # assignment aligns on the index, so a fresh RangeIndex would scatter NaNs
    # into any dataset that does not use the default 0..n-1 index.
    dataset[imputable_columns] = pd.DataFrame(
        knn_dataset, index=dataset.index, columns=imputable_columns
    )
    return dataset

# Impute the numeric columns; the helper writes into the frame it is given and returns it.
data = imputing_numeric_missing_values(dataset=data)

In [337]:
#After filling in null values.
#print(data.isnull().sum())

In [338]:
# Handle the null values that remain in the non-numeric columns.
# First give "Unnamed: 6" and "state_code.1" identifier-safe names (Unnamed_6, state_code_1)
# so they can be reached with attribute access (data.Unnamed_6, data.state_code_1).
data = data.rename(
    mapper={"Unnamed: 6": "Unnamed_6", "state_code.1": "state_code_1"},
    axis="columns",
)


In [339]:
#data["state_code_1"]

In [340]:
def _fill_nulls_by_observed_frequency(frame, column, rng):
    """Replace the nulls of ``frame[column]`` with values sampled from that column.

    Each missing entry is drawn from the column's observed values, weighted by
    their relative frequency. ``frame`` is modified in place.

    Parameters
    ----------
    frame : pandas.DataFrame
        Frame holding the column to fill.
    column : str
        Name of the column whose nulls are replaced.
    rng : numpy.random.Generator
        Random generator used for sampling, so the caller controls the seed.
    """
    missing = frame[column].isnull()
    n_missing = int(missing.sum())
    if n_missing == 0:
        return

    dist = frame[column].value_counts(normalize=True)
    if dist.empty:
        # No observed value to sample from; leave the column as it is.
        return

    frame.loc[missing, column] = rng.choice(
        dist.index.to_numpy(), size=n_missing, p=dist.to_numpy()
    )


# A fixed seed keeps the filled values, and therefore the exported CSV,
# identical across "Restart & Run All" executions.
_fill_rng = np.random.default_rng(42)

for _column in ("Unnamed_6", "closed_at", "state_code_1"):
    _fill_nulls_by_observed_frequency(data, _column, _fill_rng)




In [341]:
#After filling in null values.
#data.info()

In [342]:
#Converting string values to numeric form.
# here we convert the string based columns into integer or in numeric form.
#Converting to feature name
# state_code
# zip_code
# id
# city
# Unnamed_6
# name
# founded_at
# closed_at
# first_funding_at
# last_funding_at
# state_code_1
# category_code
# object_id
# status

# A single OrdinalEncoder is re-fitted for each column in turn; every listed
# column is replaced by its ordinal codes (floats).
# The LabelEncoder that used to be fitted before each column was removed: its
# result was never used, so it only added work.
ord_enc = OrdinalEncoder()

# NOTE(review): the *_at columns are encoded as plain categories. If they hold
# date strings, the codes follow string sort order rather than chronology;
# confirm this is intended.
columns_to_encode = [
    "state_code",
    "zip_code",
    "id",
    "city",
    "Unnamed_6",
    "founded_at",
    "closed_at",
    "first_funding_at",
    "last_funding_at",
    "object_id",
    "name",
    "category_code",
]

for column_name in columns_to_encode:
    data[column_name] = ord_enc.fit_transform(data[[column_name]])

# state_code_1 is not encoded; it is removed from the dataset instead.
to_drop = ['state_code_1']
data.drop(to_drop, axis=1, inplace=True)


#Now we will convert the status to binary value.
# 1. Binarizing the class names ('acquired' -> 1, 'closed' -> 0).
#    Series.map leaves any status outside this mapping as NaN.
diag_map = {'acquired':1, 'closed':0}
data['status'] = data['status'].map(diag_map)
# 1.1 Changing the name of the column status to: is_acquired
data.rename(columns={'status':'is_acquired'}, inplace=True)

In [343]:

# data.info()

In [344]:
# Rescale the four age_* year columns with min-max scaling (MinMaxScaler's
# default output range is [0, 1]). Note this is normalisation, not
# standardisation to zero mean / unit variance.
# is_acquired is kept as the class label for classification, so it and the
# label-like / categorical columns are deliberately left unscaled.
# `preprocessing` comes from the notebook's import cell; it is not re-imported here.

min_max_scaler = preprocessing.MinMaxScaler()

age_year_columns = ['age_first_funding_year', 'age_last_funding_year',
                    'age_first_milestone_year', 'age_last_milestone_year']

data[age_year_columns] = min_max_scaler.fit_transform(data[age_year_columns])



In [345]:
# Show the four age_* columns after scaling.
data.loc[:, ['age_first_funding_year', 'age_last_funding_year',
             'age_first_milestone_year', 'age_last_milestone_year']]

Unnamed: 0,age_first_funding_year,age_last_funding_year,age_first_milestone_year,age_last_milestone_year
0,0.365061,0.389409,0.484841,0.432611
1,0.458030,0.615461,0.544988,0.442121
2,0.325749,0.325749,0.402200,0.290656
3,0.393572,0.464142,0.519179,0.410478
4,0.292368,0.346291,0.365677,0.222272
...,...,...,...,...
918,0.309102,0.309102,0.379636,0.363880
919,0.526742,0.590579,0.519179,0.410478
920,0.566939,0.566939,0.596462,0.505232
921,0.316894,0.383922,0.384220,0.342094


In [346]:
#Now selecting the attributes to keep from the dataset.
#There are two reasons to drop a column: either it is unnecessary (for example, it is just an identifier),
#or it is highly correlated with other columns, in which case we keep only one column of the correlated
#group and remove the rest.
#So first of all we need to look at the correlation heatmap.

# Now ploting the graph
def draw_heatmap(dataset):
    """Draw an annotated Spearman correlation heatmap for ``dataset``.

    A new 18x18 inch figure is created and the pairwise Spearman correlation
    matrix of ``dataset`` is plotted on it with one-decimal annotations.
    The function returns nothing and does not call ``plt.show()``.
    """
    figure, axes = plt.subplots(figsize=(18, 18))
    spearman_corr = dataset.corr(method='spearman')
    sns.heatmap(spearman_corr, ax=axes, annot=True, fmt='.1f', linewidth=0.5)
    
    
# Dtype names used to pick the columns that go into the correlation heatmap.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

# Numeric-only view of the data and the labels of its columns.
numerical_df_1 = data.select_dtypes(include=numerics)
numerical_column_names = numerical_df_1.columns

# draw_heatmap(numerical_df_1)

In [347]:

# Feature selection based on the heatmap: a correlation of 0.7 or more is
# treated as a strong relation, and only one column of such a pair should stay.
#
# Strongly related pairs noted from the heatmap:
#   - age_first_funding_year / age_last_funding_year
#   - longitude / is_CA            -> longitude is removed below
#   - age_first_milestone_year / age_last_milestone_year
# NOTE(review): of these, only longitude is actually dropped in this cell;
# the age_* columns stay in the dataset. Confirm that is intended.
#
# The key columns are dropped as well:
#   1. Unnamed: 0
#   2. Unnamed_6 (originally "Unnamed: 6")
#   3. id
#   4. object_id

to_drop = ['longitude','Unnamed: 0','Unnamed_6','id',"object_id"]
data.drop(columns=to_drop, inplace=True)


In [348]:
#data.info()


In [349]:
# Refresh the numeric-only view after the column removals so the heatmap can be redrawn.
numerical_df_1 = data.select_dtypes(include=numerics)
numerical_column_names = numerical_df_1.columns

# draw_heatmap(numerical_df_1)

In [350]:
# Preview the first five rows of the reduced dataset.
data.head(n=5)

Unnamed: 0,state_code,latitude,zip_code,city,name,labels,founded_at,closed_at,first_funding_at,last_funding_at,...,is_othercategory,has_VC,has_angel,has_roundA,has_roundB,has_roundC,has_roundD,avg_participants,is_top500,is_acquired
0,2.0,42.35888,250.0,173.0,75.0,1.0,15.0,8.0,295.0,7.0,...,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1
1,2.0,37.238916,336.0,108.0,781.0,1.0,8.0,7.0,216.0,205.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,4.75,1.0,1
2,2.0,32.901049,251.0,173.0,585.0,1.0,106.0,7.0,278.0,322.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,4.0,1.0,1
3,2.0,37.320309,333.0,55.0,712.0,1.0,10.0,59.0,218.0,379.0,...,0.0,0.0,0.0,0.0,1.0,1.0,1.0,3.3333,1.0,1
4,2.0,37.779281,295.0,174.0,351.0,0.0,183.0,24.0,490.0,340.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0


In [351]:
    # Now we will again remove 
    # •	labels
    # •	state_code
    # •	is_CA

# Second removal pass: labels, state_code and is_CA leave the dataset.
to_drop = ['labels','state_code',"is_CA"]
data.drop(columns=to_drop, inplace=True)


In [352]:
# Refresh the numeric-only view after the latest column removals.
numerical_df_1 = data.select_dtypes(include=numerics)
numerical_column_names = numerical_df_1.columns

# draw_heatmap(numerical_df_1)

In [353]:
# data[['age_first_funding_year', 'age_last_funding_year','age_first_milestone_year',
#         'age_last_milestone_year']]

In [354]:
# to_drop = ['age_last_milestone_year',]
# data.drop(to_drop, axis=1, inplace=True)


In [355]:
# Refresh the numeric-only view once more (the drop in the preceding cell is commented out).
numerical_df_1 = data.select_dtypes(include=numerics)
numerical_column_names = numerical_df_1.columns

# draw_heatmap(numerical_df_1)

In [356]:
# Write the processed dataset to CSV, with a header row and without the index column.
data.to_csv(
    'Dataset_02_standarized_.csv',
    header=True,
    index=False,
)
# data.to_csv ('Dataset_02_non_standarized_.csv', index = False, header=True)