# Exploring with Visuals
- 2008: extract int from string
- 2018: convert float to int

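Load datasets `data_08_v2.csv` and `data_18_v2.csv`. You should have created these data files in the previous section: *Filter, Drop Nulls, Dedupe*. This notebook instead reads the processed `all_alpha_08.csv` and `all_alpha_18.csv` files from GitHub, then repeats the null and duplicate clean-up before fixing the data types.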

In [None]:
import pandas as pd

In [None]:
# Raw GitHub locations of the processed 2008 and 2018 CSV files
url_08 = 'https://raw.githubusercontent.com/bentegviz/udacity_intro_to_data_analysis/main/Case%20Study%202/data/processed/all_alpha_08.csv'
url_18 = 'https://raw.githubusercontent.com/bentegviz/udacity_intro_to_data_analysis/main/Case%20Study%202/data/processed/all_alpha_18.csv'

In [None]:
# Read each CSV from its URL into a DataFrame (2008 first, then 2018);
# this needs network access to GitHub.
df_08, df_18 = (pd.read_csv(source) for source in (url_08, url_18))
print('Load CSV Complete')

In [None]:
# Tally how often each distinct value occurs in the 2008 'Cyl' column
cyl_08 = df_08['Cyl']
cyl_08.value_counts()

In [None]:
# Number of missing entries in every column of the 2008 data
# (isna is the pandas alias of isnull)
df_08.isna().sum()

In [None]:
# Number of missing entries in every column of the 2018 data
# (isna is the pandas alias of isnull)
df_18.isna().sum()

In [None]:
# Remove every row containing at least one null, modifying both frames in place
for frame in (df_08, df_18):
    frame.dropna(inplace=True)

In [None]:
# True if any 2008 column still has a non-zero null count - expected: False
(df_08.isna().sum() > 0).any()

In [None]:
# True if any 2018 column still has a non-zero null count - expected: False
(df_18.isna().sum() > 0).any()

In [None]:
# Report how many fully duplicated rows each dataset holds (2008, then 2018)
for frame in (df_08, df_18):
    print(frame.duplicated().sum())

In [None]:
# Discard repeated rows, keeping the first occurrence, in place for both frames
for frame in (df_08, df_18):
    frame.drop_duplicates(inplace=True)

In [None]:
# Re-count duplicated rows after the drop (2008 on the first line, 2018 on the second)
print(df_08.duplicated().sum(), df_18.duplicated().sum(), sep='\n')

Read [this](https://stackoverflow.com/questions/35376387/extract-int-from-string-in-pandas) to help you extract ints from strings in Pandas for the next step.

Fix Cyl to Int

In [None]:
# Extract the integer cylinder count from the strings in the 2008 'Cyl' column.
# - The pattern is a raw string so that `\d` reaches the regex engine as written;
#   in a plain literal '\d' is an invalid escape sequence and triggers a warning.
# - expand=False makes str.extract return a Series (one capture group) instead of
#   a one-column DataFrame, so the result is assigned back as a plain column.
# NOTE: this cell is not re-runnable; once 'Cyl' is numeric the .str accessor fails.
df_08['Cyl'] = df_08['Cyl'].str.extract(r'(\d+)', expand=False).astype(int)

In [None]:
# Confirm the conversion: tally the values now stored in the 2008 'Cyl' column
cyl_08_fixed = df_08['Cyl']
cyl_08_fixed.value_counts()

In [None]:
# Cast the 2018 'Cyl' column to 64-bit integers and store it back
cyl_18_as_int = df_18['Cyl'].astype('int64')
df_18['Cyl'] = cyl_18_as_int

In [None]:
# Tally the values in the 2018 'Cyl' column after the cast
cyl_18 = df_18['Cyl']
cyl_18.value_counts()

Fix Air Pollution Score to Float

In [None]:
# Inspect the Air Pollution Score values of BOTH years before converting them.
# (The second line previously repeated df_08, so the 2018 data was never shown.)
print(df_08['Air Pollution Score'].value_counts())
print(df_18['Air Pollution Score'].value_counts())


In [None]:
# Change Air Pollution Score to float in both datasets.
# 2008: the values are strings, so pull out the first run of digits and cast it.
#       The pattern is a raw string ('\d' in a plain literal is an invalid escape
#       sequence) and expand=False returns a Series instead of a one-column DataFrame.
#       Only the first number in each string is kept.
# 2018: a direct cast is used.
df_08['Air Pollution Score'] = (
    df_08['Air Pollution Score'].str.extract(r'(\d+)', expand=False).astype(float)
)
df_18['Air Pollution Score'] = df_18['Air Pollution Score'].astype('float')


In [None]:
# Show the resulting dtype of the Air Pollution Score column (2008, then 2018)
for frame in (df_08, df_18):
    print(frame['Air Pollution Score'].dtype)

Fix MPG to Float

In [None]:
# Change the city, highway and combined MPG columns to float in both datasets.
# Each value is treated as a string: the first run of digits is extracted and
# cast to float (anything after that first number is discarded).
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False makes str.extract return a Series.
mpg_columns = ['City MPG', 'Hwy MPG', 'Cmb MPG']
for frame in (df_08, df_18):
    for column in mpg_columns:
        frame[column] = frame[column].str.extract(r'(\d+)', expand=False).astype(float)

In [None]:
# List the dtype of every column, first for 2008 and then for 2018
for frame in (df_08, df_18):
    print(frame.dtypes)

Fix Greenhouse Gas Score to Int

In [None]:
# Extract the integer Greenhouse Gas Score from the string values in both datasets.
# The first run of digits in each string is kept and cast to int.
# The pattern is a raw string because '\d' in a plain literal is an invalid
# escape sequence; expand=False makes str.extract return a Series instead of
# a one-column DataFrame.
for frame in (df_08, df_18):
    frame['Greenhouse Gas Score'] = (
        frame['Greenhouse Gas Score'].str.extract(r'(\d+)', expand=False).astype(int)
    )

In [None]:
# Show the resulting dtype of the Greenhouse Gas Score column (2008, then 2018)
ghg_08 = df_08['Greenhouse Gas Score']
ghg_18 = df_18['Greenhouse Gas Score']
print(ghg_08.dtype, ghg_18.dtype, sep='\n')

Export CSV

In [None]:
# Write the cleaned 2008 data to CSV (without the index column) and trigger a
# browser download; `files` is only available when running in Google Colab.
from google.colab import files

out_path_08 = 'data_08_v3.csv'
df_08.to_csv(out_path_08, index=False)
files.download(out_path_08)

In [None]:
# Write the cleaned 2018 data to CSV (without the index column) and trigger a
# browser download; relies on `files` imported from google.colab in the previous cell.
out_path_18 = 'data_18_v3.csv'
df_18.to_csv(out_path_18, index=False)
files.download(out_path_18)