Step 0 - import packages

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Step 1 - create pandas dataframe 

In [2]:
# Load the raw meteorite table from the CSV file in the working directory.
df = pd.read_csv(filepath_or_buffer="meteorites.csv")

In [3]:
# Preview the first five rows of the raw frame.
df.head(n=5)

Unnamed: 0,name,id,nametype,recclass,mass (g),fall,year,reclat,reclong,GeoLocation
0,Aachen,1,Valid,L5,21.0,Fell,1880.0,50.775,6.08333,"(50.775, 6.08333)"
1,Aarhus,2,Valid,H6,720.0,Fell,1951.0,56.18333,10.23333,"(56.18333, 10.23333)"
2,Abee,6,Valid,EH4,107000.0,Fell,1952.0,54.21667,-113.0,"(54.21667, -113.0)"
3,Acapulco,10,Valid,Acapulcoite,1914.0,Fell,1976.0,16.88333,-99.9,"(16.88333, -99.9)"
4,Achiras,370,Valid,L6,780.0,Fell,1902.0,-33.16667,-64.95,"(-33.16667, -64.95)"


In [4]:
# Summarize the raw frame: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45716 entries, 0 to 45715
Data columns (total 10 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   name         45716 non-null  object 
 1   id           45716 non-null  int64  
 2   nametype     45716 non-null  object 
 3   recclass     45716 non-null  object 
 4   mass (g)     45585 non-null  float64
 5   fall         45716 non-null  object 
 6   year         45425 non-null  float64
 7   reclat       38401 non-null  float64
 8   reclong      38401 non-null  float64
 9   GeoLocation  38401 non-null  object 
dtypes: float64(4), int64(1), object(5)
memory usage: 3.5+ MB


In [5]:
# Tally how many rows carry each 'fall' category.
df["fall"].value_counts()

fall
Found    44609
Fell      1107
Name: count, dtype: int64

Step 2 - Clean the Data
- In the 'fall' column, I only want to look at 'Fell' and not 'Found', so I need to remove all of the rows marked 'Found'.
- Then there are some columns I don't need: nametype, reclat, reclong, GeoLocation.
- Lastly, the year has the wrong dtype. It should be an int, not a float.


In [13]:
# Keep only the rows whose fall status is "Fell"; everything else
# (i.e. "Found") is excluded. The tally below confirms the result.
meteorites_df = df.loc[df["fall"] == "Fell"]
meteorites_df["fall"].value_counts()

fall
Fell    1107
Name: count, dtype: int64

In [14]:
# Discard the columns not needed for this analysis, then re-inspect the frame.
meteorites_df = meteorites_df.drop(
    labels=['nametype', 'reclat', 'reclong', 'GeoLocation'],
    axis="columns",
)
meteorites_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1107 entries, 0 to 1110
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      1107 non-null   object 
 1   id        1107 non-null   int64  
 2   recclass  1107 non-null   object 
 3   mass (g)  1075 non-null   float64
 4   fall      1107 non-null   object 
 5   year      1107 non-null   float64
dtypes: float64(2), int64(1), object(3)
memory usage: 60.5+ KB


In [15]:
# Convert year from float to integer.
# The width is pinned to int32 because a bare `int` resolves to a
# platform-dependent dtype (32-bit on Windows with NumPy 1.x, 64-bit
# elsewhere), so the dtype reported by info() would differ between machines.
# NOTE: casting to an integer dtype raises if the column contains NaN; the
# filtered frame has no missing years, so the cast is safe here.
meteorites_df['year'] = meteorites_df['year'].astype('int32')
meteorites_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1107 entries, 0 to 1110
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      1107 non-null   object 
 1   id        1107 non-null   int64  
 2   recclass  1107 non-null   object 
 3   mass (g)  1075 non-null   float64
 4   fall      1107 non-null   object 
 5   year      1107 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(3)
memory usage: 56.2+ KB


In [16]:
# Count the missing entries in each column.
print(meteorites_df.isnull().sum(axis=0))

name         0
id           0
recclass     0
mass (g)    32
fall         0
year         0
dtype: int64


In [22]:
# Rename 'mass (g)' to 'mass', then drop the rows whose mass is missing.
# dropna is restricted to the 'mass' column so that a null appearing in any
# other column cannot silently remove rows; the rename is chained instead of
# done in place so the cell reassigns a fresh frame rather than mutating one.
meteorites_df = (
    meteorites_df
    .rename(columns={'mass (g)': 'mass'})
    .dropna(subset=['mass'])
)
# Confirm that no missing values remain.
print(meteorites_df.isna().sum())

name        0
id          0
recclass    0
mass        0
fall        0
year        0
dtype: int64


In [24]:
# Final sanity check of the cleaned frame: structure first, then a preview
# of the first five rows.
meteorites_df.info()
meteorites_df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
Index: 1075 entries, 0 to 1110
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      1075 non-null   object 
 1   id        1075 non-null   int64  
 2   recclass  1075 non-null   object 
 3   mass      1075 non-null   float64
 4   fall      1075 non-null   object 
 5   year      1075 non-null   int32  
dtypes: float64(1), int32(1), int64(1), object(3)
memory usage: 54.6+ KB


Unnamed: 0,name,id,recclass,mass,fall,year
0,Aachen,1,L5,21.0,Fell,1880
1,Aarhus,2,H6,720.0,Fell,1951
2,Abee,6,EH4,107000.0,Fell,1952
3,Acapulco,10,Acapulcoite,1914.0,Fell,1976
4,Achiras,370,L6,780.0,Fell,1902
