In [2]:
# Import all relevant libraries for cleaning data
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [3]:
# Load the Kickstarter campaigns CSV into a DataFrame and preview the first rows
df = pd.read_csv('Kickstarter Campaigns DataSet.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,id,name,currency,launched_at,backers_count,blurb,country,deadline,slug,status,usd_pledged,sub_category,main_category,creator_id,blurb_length,goal_usd,city,duration
0,0,1714728788,Urban Farm Kits: Our Solution to City Farming,CAD,2018-02-10 22:47:52,16,"Earn $1,000 - $20,000+ by transforming idle re...",CA,2018-04-11 21:47:52,urban-farm-kits-our-solution-to-city-farming,failed,577.902078,food,Farms,41046209,122,785.82065,Victoria,60.0
1,1,1639965256,Reflections on the LHC Safety Report,USD,2014-11-21 10:42:26,7,"Need Peer Review, Open Source style",US,2014-12-15 10:42:26,reflections-on-the-lhc-safety-report-review-an...,successful,211.0,publishing,Academic,1877377173,35,93.0,San Diego,24.0
2,2,1582062943,Ultra-Custom Chocolate Bars,USD,2020-10-19 05:26:59,99,Ultra-customized bean to bar chocolate. Any fl...,US,2020-11-18 05:26:59,ultra-custom-chocolate-bars-0,successful,4697.0,food,Small Batch,205122366,122,1500.0,Philadelphia,30.0
3,3,1888984372,GlassWeb: Internet Monitoring,USD,2019-05-29 16:01:34,3,A module device to monitor internet connection...,US,2019-07-28 16:01:34,glassweb-internet-monitoring,failed,222.0,technology,Gadgets,1458852038,69,25000.0,Chicago,60.0
4,4,1409770586,Dreaming of Streaming - 96.9 FM KMRD-LP Madrid...,USD,2015-11-19 20:36:01,100,"Madrid's low power FM community radio station,...",US,2015-12-19 20:36:01,dreaming-of-streaming-969-fm-kmrd-lp-madrid-ne...,successful,5410.0,publishing,Radio & Podcasts,1845213665,131,5000.0,Madrid,30.0


Looking at this dataframe, there seem to be columns that we don't need. 
First, we don't need the unnamed column; we also don't need creator_id, blurb, slug, or deadline (because we already have a column called duration).

In [4]:
# List the column labels to decide which ones to drop
df.columns

Index(['Unnamed: 0', 'id', 'name', 'currency', 'launched_at', 'backers_count',
       'blurb', 'country', 'deadline', 'slug', 'status', 'usd_pledged',
       'sub_category', 'main_category', 'creator_id', 'blurb_length',
       'goal_usd', 'city', 'duration'],
      dtype='object')

In [5]:
# Drop the columns that are not needed for the analysis; `deadline` is
# redundant because the frame already has a `duration` column.
# Rebinding `df` (rather than inplace=True) together with errors='ignore'
# makes this cell safe to re-run: a second execution no longer raises
# KeyError for columns that were already removed.
df = df.drop(
    columns=['Unnamed: 0', 'blurb', 'slug', 'creator_id', 'deadline'],
    errors='ignore',
)

In [6]:
# Preview the frame to confirm the unwanted columns are gone
df.head()

Unnamed: 0,id,name,currency,launched_at,backers_count,country,status,usd_pledged,sub_category,main_category,blurb_length,goal_usd,city,duration
0,1714728788,Urban Farm Kits: Our Solution to City Farming,CAD,2018-02-10 22:47:52,16,CA,failed,577.902078,food,Farms,122,785.82065,Victoria,60.0
1,1639965256,Reflections on the LHC Safety Report,USD,2014-11-21 10:42:26,7,US,successful,211.0,publishing,Academic,35,93.0,San Diego,24.0
2,1582062943,Ultra-Custom Chocolate Bars,USD,2020-10-19 05:26:59,99,US,successful,4697.0,food,Small Batch,122,1500.0,Philadelphia,30.0
3,1888984372,GlassWeb: Internet Monitoring,USD,2019-05-29 16:01:34,3,US,failed,222.0,technology,Gadgets,69,25000.0,Chicago,60.0
4,1409770586,Dreaming of Streaming - 96.9 FM KMRD-LP Madrid...,USD,2015-11-19 20:36:01,100,US,successful,5410.0,publishing,Radio & Podcasts,131,5000.0,Madrid,30.0


In [8]:
# Count the missing values in each column
df.isna().sum()

id               0
name             0
currency         0
launched_at      0
backers_count    0
country          0
status           0
usd_pledged      0
sub_category     0
main_category    0
blurb_length     0
goal_usd         0
city             0
duration         0
dtype: int64

In [14]:
# There are no null values, so check for duplicates next.
# `id` is treated as the primary key, so start with that column.
# duplicated() already returns a boolean mask; summing it counts the repeats.
dup_id = df['id'].duplicated()
dup_id.sum()

24357

In [15]:
# Many ids are duplicated. Now count the repeated campaign names as well.
dup_name = df['name'].duplicated()
dup_name.sum()

24982

There are actually more duplicated names than duplicated ids, but that's okay: as long as two rows have different ids, they represent different campaigns. 

In [18]:
# Keep only the first row seen for each id ('first' is the default for `keep`)
df.drop_duplicates(subset=['id'], inplace=True)
# Confirm that no duplicate ids remain (the count should be 0)
df['id'].duplicated().sum()

0

In [22]:
# Preview the frame after removing duplicate ids
df.head()

Unnamed: 0,id,name,currency,launched_at,backers_count,country,status,usd_pledged,sub_category,main_category,blurb_length,goal_usd,city,duration
0,1714728788,Urban Farm Kits: Our Solution to City Farming,CAD,2018-02-10 22:47:52,16,CA,failed,577.902078,food,Farms,122,785.82065,Victoria,60.0
1,1639965256,Reflections on the LHC Safety Report,USD,2014-11-21 10:42:26,7,US,successful,211.0,publishing,Academic,35,93.0,San Diego,24.0
2,1582062943,Ultra-Custom Chocolate Bars,USD,2020-10-19 05:26:59,99,US,successful,4697.0,food,Small Batch,122,1500.0,Philadelphia,30.0
3,1888984372,GlassWeb: Internet Monitoring,USD,2019-05-29 16:01:34,3,US,failed,222.0,technology,Gadgets,69,25000.0,Chicago,60.0
4,1409770586,Dreaming of Streaming - 96.9 FM KMRD-LP Madrid...,USD,2015-11-19 20:36:01,100,US,successful,5410.0,publishing,Radio & Podcasts,131,5000.0,Madrid,30.0


In [27]:
# Create columns for the month and year in which each campaign was launched.
# `launched_at` holds text such as '2018-02-10 22:47:52', so splitting on '-'
# yields the year at position 0 and the month at position 1.
# The split is done once with the vectorized .str accessor instead of two
# row-wise apply(lambda ...) passes; the new columns still hold strings
# (e.g. '02', '2018'), exactly as before.
launched_parts = df['launched_at'].str.split('-')
df['month'] = launched_parts.str[1]
df['launched_year'] = launched_parts.str[0]
df['month'].head()


0    02
1    11
2    10
3    05
4    11
Name: month, dtype: object

In [26]:
# Preview the first values of the launched_year column
df['launched_year'].head()

0    2018
1    2014
2    2020
3    2019
4    2015
Name: launched_year, dtype: object

In [29]:
# Round both USD amount columns to one decimal place, then preview the result
money_cols = ['usd_pledged', 'goal_usd']
df[money_cols] = df[money_cols].round(1)
df.head()

Unnamed: 0,id,name,currency,launched_at,backers_count,country,status,usd_pledged,sub_category,main_category,blurb_length,goal_usd,city,duration,month,launched_year
0,1714728788,Urban Farm Kits: Our Solution to City Farming,CAD,2018-02-10 22:47:52,16,CA,failed,577.9,food,Farms,122,785.8,Victoria,60.0,2,2018
1,1639965256,Reflections on the LHC Safety Report,USD,2014-11-21 10:42:26,7,US,successful,211.0,publishing,Academic,35,93.0,San Diego,24.0,11,2014
2,1582062943,Ultra-Custom Chocolate Bars,USD,2020-10-19 05:26:59,99,US,successful,4697.0,food,Small Batch,122,1500.0,Philadelphia,30.0,10,2020
3,1888984372,GlassWeb: Internet Monitoring,USD,2019-05-29 16:01:34,3,US,failed,222.0,technology,Gadgets,69,25000.0,Chicago,60.0,5,2019
4,1409770586,Dreaming of Streaming - 96.9 FM KMRD-LP Madrid...,USD,2015-11-19 20:36:01,100,US,successful,5410.0,publishing,Radio & Podcasts,131,5000.0,Madrid,30.0,11,2015


In [33]:
# Encode the campaign outcome as a binary flag: 1 when status is 'successful',
# 0 for every other status value. The comparison yields a boolean mask that is
# cast to int64 so the column holds plain 0/1 integers.
df['success_yn'] = df['status'].eq('successful').astype('int64')
df['success_yn'].unique()

array([0, 1])

This concludes the data cleaning part of this project. 

This data was already pretty clean to star