## Step 1 - Importing Libraries & Dataset
---

In [1]:
#IMPORTING LIBRARIES

import pandas as pd
import numpy as np
# Let pandas print up to 999 rows before truncating displayed output.
pd.set_option('display.max_rows', 999)

In [2]:
# Read the raw CSV file into the working DataFrame `df`.

RAW_DATA_PATH = 'comp_original.csv'
df = pd.read_csv(RAW_DATA_PATH)

## Step 2 - Quick overview
---

In [3]:
# Preview the first rows to sanity-check the load.

df.head()

Unnamed: 0,permalink,name,homepage_url,category_list,funding_total_usd,status,country_code,state_code,region,city,funding_rounds,founded_at,first_funding_at,last_funding_at
0,/organization/-fame,#fame,http://livfame.com,Media,10000000,operating,IND,16,Mumbai,Mumbai,1.0,,2015-01-05,2015-01-05
1,/organization/-qounter,:Qounter,http://www.qounter.com,Application Platforms|Real Time|Social Network...,700000,operating,USA,DE,DE - Other,Delaware City,2.0,2014-09-04,2014-03-01,2014-10-14
2,/organization/-the-one-of-them-inc-,"(THE) ONE of THEM,Inc.",http://oneofthem.jp,Apps|Games|Mobile,3406878,operating,,,,,1.0,,2014-01-30,2014-01-30
3,/organization/0-6-com,0-6.com,http://www.0-6.com,Curated Web,2000000,operating,CHN,22,Beijing,Beijing,1.0,2007-01-01,2008-03-19,2008-03-19
4,/organization/004-technologies,004 Technologies,http://004gmbh.de/en/004-interact,Software,-,operating,USA,IL,"Springfield, Illinois",Champaign,1.0,2010-01-01,2014-07-24,2014-07-24


In [4]:
# Check the dataset dimensions as (rows, columns).

df.shape

(13826, 14)

In [5]:
# List the column names available in the dataset.

df.columns

Index(['permalink', 'name', 'homepage_url', 'category_list',
       'funding_total_usd', 'status', 'country_code', 'state_code', 'region',
       'city', 'funding_rounds', 'founded_at', 'first_funding_at',
       'last_funding_at'],
      dtype='object')

## Step 3 - Cleaning

In [6]:
# Discard the columns that are not used further, keeping only
# funding_total_usd, country_code, city and funding_rounds.

unused_columns = [
    'permalink', 'homepage_url', 'state_code', 'region', 'status',
    'name', 'category_list', 'founded_at', 'first_funding_at',
    'last_funding_at',
]
df.drop(columns=unused_columns, inplace=True)

In [7]:
# Confirm the new dimensions after dropping the unused columns.

df.shape

(13826, 4)

In [8]:
# Count the missing (NaN) values in each remaining column.

df.isnull().sum()

funding_total_usd       0
country_code         1330
city                 1561
funding_rounds          1
dtype: int64

In [9]:
# Remove every row that has a missing value in any of the three columns
# listed below ('funding_total_usd' is not checked here).

columns_with_nulls = ['country_code', 'city', 'funding_rounds']
df.dropna(subset=columns_with_nulls, inplace=True)

In [10]:
# Verify that no missing values remain in any column.

df.isnull().sum()

funding_total_usd    0
country_code         0
city                 0
funding_rounds       0
dtype: int64

In [11]:
# Check the dimensions again after removing the rows with missing values.

df.shape

(12265, 4)

In [12]:
# Inspect the dtype of each column.

df.dtypes

funding_total_usd     object
country_code          object
city                  object
funding_rounds       float64
dtype: object

In [13]:
# Remove the rows whose 'funding_total_usd' value contains a '-', which is
# how a missing amount appears in the preview of the raw data.
# NOTE(review): any value containing '-' is removed, not only a bare '-'.

dash_position = df['funding_total_usd'].str.find('-')
rows_without_amount = dash_position[dash_position != -1].index
df.drop(index=rows_without_amount, inplace=True)
df.shape

(10167, 4)

In [14]:
# Preview the first rows after the cleaning steps so far.
df.head()

Unnamed: 0,funding_total_usd,country_code,city,funding_rounds
0,10000000,IND,Mumbai,1.0
1,700000,USA,Delaware City,2.0
3,2000000,CHN,Beijing,1.0
5,41250,HKG,Hong Kong,1.0
6,762851,CAN,Vancouver,2.0


In [15]:
# Cast 'funding_total_usd' from object dtype to float, then confirm the dtypes.

funding_as_float = df['funding_total_usd'].astype(float)
df['funding_total_usd'] = funding_as_float
df.dtypes

funding_total_usd    float64
country_code          object
city                  object
funding_rounds       float64
dtype: object

In [16]:
#GROUPING THE COUNTRY CODES BY REGION OF THE WORLD

def categories_countries(x):
    """Return the world-region label for a country code.

    ``x`` is compared against fixed groups of three-letter country codes,
    checked in order. The label of the first group that contains ``x`` is
    returned; ``'Other'`` is returned when no group contains it.
    """
    region_groups = (
        ('NorthAm', ('USA', 'CAN')),
        ('EU', ('GBR', 'FRA', 'DEU', 'IRL', 'SWE', 'ESP', 'NLD', 'RUS',
                'ITA', 'DNK', 'BEL', 'FIN', 'CHE')),
        ('AS', ('CHN', 'IND', 'AUS', 'KOR', 'JPN', 'SGP', 'HKG')),
        ('SouthAm', ('CHL', 'BRA')),
    )
    for region, codes in region_groups:
        if x in codes:
            return region
    # Codes outside every group (including non-string values) fall through.
    return 'Other'

In [17]:
# Replace each country code with its region label. The column keeps the name
# 'country_code' even though it now holds region labels.
df['country_code'] = df['country_code'].apply(categories_countries)

In [18]:
# Count how many rows fall into each region.
df['country_code'].value_counts()

NorthAm    7096
EU         1586
AS          789
Other       554
SouthAm     142
Name: country_code, dtype: int64

## Step 4 - Exporting the DataFrame to .csv for use in other notebooks
---

In [22]:
# Save the cleaned frame; index=False leaves the row index out of the file.
df.to_csv('comp_modified_v2.csv',index=False)