# Exploratory Data Analysis of Meteorite landings on Earth recorded by NASA

Project 1 dataset link: https://catalog.data.gov/dataset/meteorite-landings

Let's import the libraries needed for data loading, data analysis, and visualization.

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import geopandas as gpd
!pip install cartopy
import cartopy.crs as ccrs
import cartopy.feature as cfeature
import folium
from geopy.geocoders import Nominatim

Collecting cartopy
  Downloading Cartopy-0.23.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m30.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: cartopy
Successfully installed cartopy-0.23.0


Checking the versions of modules imported

In [4]:
# Report the installed pandas and NumPy versions so the run can be reproduced.
for library in (pd, np):
    print(library.__version__)

2.0.3
1.25.2


In [None]:
# Load the raw meteorite-landings CSV and show it as the cell output.
raw_csv_path = 'Meteorite_Landings.csv'
df = pd.read_csv(raw_csv_path)
df

A few of the headers in the given dataset do not convey the meaning of their values clearly, so let's define new headers based on our understanding of the data.

In [None]:
# Read the CSV, skipping the original header line.
# header=None is required here: without it pandas promotes the first *data*
# row to column names, and that record is silently lost as soon as the
# columns are renamed afterwards.
df1 = pd.read_csv('Meteorite_Landings.csv', skiprows=1, header=None)

In [None]:
# Descriptive replacement column names, one per column of the loaded frame
# (the assignment below fails if the counts differ).
headers_list = ['Meteor_name', 'Meteor_ID', 'Meteor_Type',
                'Meteor_class', 'Meteor_mass', 'Meteor_fall/found',
                'Meteor_year', 'latitude', 'longitude', 'Meteor_geolocation'
               ]

# Apply the new headers to the DataFrame
df1.columns = headers_list

# Save the renamed data to a SEPARATE file, with its header row.
# Overwriting 'Meteorite_Landings.csv' without a header corrupts the input for
# the next run: the read cells would then treat real records as the header
# line, so rows disappear every time the notebook is re-executed.
df1.to_csv('Meteorite_Landings_renamed.csv', index=False)

In [None]:
df1.dtypes  # Display the data type of every column in the dataset.

In [None]:
# Display the DataFrame as the cell output to inspect it with the renamed columns.
df1

In [None]:
df1.describe()
# This summary may not be very helpful for our dataset, because there is
# little point in calculating the mean or median of these values.

In [None]:
# Count the empty (null) cells in each column; any non-zero entry marks a
# column that has gaps in the data.
missing_per_column = df1.isna().sum()
print(missing_per_column)

In [None]:
# Select and print every record that has a null in at least one column.
has_missing_value = df1.isna().any(axis='columns')
rows_with_empty_values = df1.loc[has_missing_value]
print(rows_with_empty_values)

In [None]:
# Histogram (20 bins) of the recorded masses with a KDE curve overlaid.
# Records without a mass are dropped before plotting.
fig, ax = plt.subplots(figsize=(5, 6))
recorded_masses = df1['Meteor_mass'].dropna()
sns.histplot(recorded_masses, bins=20, kde=True, ax=ax)
ax.set(
    xlabel='Meteorite Mass (grams)',
    ylabel='Frequency',
    title='Distribution of Meteorite Mass',
)
plt.show()

In [None]:
# Number of records for each year present in the data.
meteor_counts = df1.groupby('Meteor_year').size()

# Line plot of the yearly counts, one star marker per year.
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(meteor_counts.index, meteor_counts.values, marker='*', linestyle='-')
ax.set(
    xlabel='Year of Discovery',
    ylabel='Count of Meteors',
    title='Count of Meteors by Year of Discovery',
)
ax.grid(True)
plt.show()

In [None]:
# Keep only records whose year is present and finite.
# .copy() makes the filtered frame independent, so the column assignment below
# does not write into a slice of the original (SettingWithCopyWarning).
year_values = df1['Meteor_year']
has_usable_year = year_values.notna() & ~year_values.isin([float('inf'), float('-inf')])
df1 = df1.loc[has_usable_year].copy()

# Convert 'Meteor_year' column to integer
df1['Meteor_year'] = df1['Meteor_year'].astype(int)

# Create a DataFrame with a continuous sequence of years
min_year = df1['Meteor_year'].min()
max_year = df1['Meteor_year'].max()
all_years = pd.DataFrame({'Meteor_year': range(min_year, max_year + 1)})

# Count the records per year; years with no records are filled in as 0 so the
# line drops to zero instead of jumping between sparse years.
meteor_counts = df1.groupby('Meteor_year').size().reindex(all_years['Meteor_year'], fill_value=0)

# Plotting the graph
plt.figure(figsize=(10, 6))
plt.plot(meteor_counts.index, meteor_counts.values, marker='o', linestyle='-')
plt.xlabel('Year of Discovery')
plt.ylabel('Count of Meteors')
plt.title('Count of Meteors by Year of Discovery')
plt.grid(True)

# A tick on every single year is unreadable (and slow to draw) when the data
# spans many years, so label roughly 20 evenly spaced years instead.
tick_step = max(1, (max_year - min_year) // 20)
plt.xticks(range(min_year, max_year + 1, tick_step), rotation=45)  # rotated for readability
plt.show()

In [None]:
# Order the records by year; the top row is then the earliest discovery.
df_sorted = df1.sort_values('Meteor_year')
first_meteor = df_sorted.iloc[:1]

# Announce and display the earliest record.
print("Details of the first meteor discovery as per the given dataset:")
first_meteor

In [None]:
# Find the index label of the largest recorded mass, then fetch that record.
heaviest_label = df1['Meteor_mass'].idxmax()
heaviest_meteor = df1.loc[heaviest_label]
print("The heaviest meteor fell on Earth:", heaviest_meteor, sep="\n")

In [None]:
# Ask for 50 equal-width bins directly instead of deriving an integer bin
# width for range(): that width truncates to 0 when the mass range is below
# 50 (range() then raises ValueError), and it rounds every bin edge to whole
# grams.
N_MASS_BINS = 50

plt.figure(figsize=(10, 10))
sns.histplot(df1['Meteor_mass'].dropna(), bins=N_MASS_BINS, kde=False)
plt.xlabel('Meteorite Mass (grams)')
plt.ylabel('Frequency')
plt.title('Distribution of Meteorite Mass')
plt.show()

In [None]:
# Draw a histogram for each column that DataFrame.hist can plot, all in one figure.
df1.hist(figsize=(10, 8))
plt.tight_layout()  # adjust subplot spacing so titles and tick labels do not overlap
plt.show()

In [None]:
# Static world map showing where meteorite occurrences were recorded.
# A more advanced, interactive world-map visualization follows in the cells below.

# Turn each record's longitude/latitude pair into a point geometry.
landing_points = gpd.points_from_xy(df1['longitude'], df1['latitude'])
gdf = gpd.GeoDataFrame(df1, geometry=landing_points)

# Whole-globe axes in the PlateCarree projection, with coastlines and dotted borders.
fig, ax = plt.subplots(
    figsize=(12, 12),
    subplot_kw={'projection': ccrs.PlateCarree()},
)
ax.add_feature(cfeature.COASTLINE)
ax.add_feature(cfeature.BORDERS, linestyle=':')
ax.set_extent([-180, 180, -90, 90])

# Overlay the records as small, semi-transparent red stars.
gdf.plot(ax=ax, marker='*', color='red', markersize=1, alpha=0.7)

plt.show()


The code below creates an HTML page that visualizes, on a world map, the points where meteorite data was recorded.

In [None]:
# Imported in this cell so it runs on its own; folium itself is imported at the top.
from folium.plugins import MarkerCluster

# Only records that have both coordinates can be placed on the map.
df1_clean = df1.dropna(subset=['latitude', 'longitude'])

# Create the map, centred on the mean recorded position
m = folium.Map(location=[df1_clean['latitude'].mean(), df1_clean['longitude'].mean()], zoom_start=1)

# Add markers through a cluster layer: putting one top-level marker per record
# straight onto the map gives a very heavy HTML page on a dataset this large,
# whereas the cluster groups nearby markers until the user zooms in.
marker_cluster = MarkerCluster().add_to(m)

# Iterate over the three needed columns directly (much cheaper than iterrows).
for lat, lon, name in zip(df1_clean['latitude'], df1_clean['longitude'], df1_clean['Meteor_name']):
    # str() keeps the popup valid even if a name is not stored as text.
    folium.Marker([lat, lon], popup=str(name)).add_to(marker_cluster)

# Save the map
m.save('meteor_landings_map.html')

When opened in a browser, the HTML page shows on a world map the places where meteorites fell. Screenshots are linked below for reference.
https://umbc.box.com/s/xyx9k5rmgpe1lhgm0lta88az29hc7xf5
https://umbc.box.com/s/igm2fa6t0sb5lj18ui5lkltuotfcr6u6

In [None]:
# NOTE: everything in this cell is intentionally left commented out.
# It would reverse-geocode every record with the Nominatim geolocator to derive
# a 'country' column, which takes a long time to run on a dataset of this size.

# Initialize the geolocator
#geolocator = Nominatim(user_agent="MyGeocoder")

# Define a function to get the country from latitude and longitude
#def get_country(lat, lon):
#    try:
#        # Get location information
#        location = geolocator.reverse([lat, lon], exactly_one=True)
#        address = location.raw['address']
#        country = address.get('country', '')
#        return country
#    except:
#        # Return None if no country found
#        return None

# Create a new column 'country' using the latitude and longitude columns
#df1['country'] = df1.apply(lambda row: get_country(row['latitude'], row['longitude']), axis=1)

In [None]:
# As the dataset is large, creating the country column takes a long time to run, so that code is commented out above.

In [None]:
# 2D histogram showing how many meteorites were recorded in each
# latitude/longitude cell of the dataset.

# Records missing either coordinate cannot be binned, so drop them first.
df1_clean = df1.dropna(subset=['latitude', 'longitude'])
latitudes = df1_clean['latitude']
longitudes = df1_clean['longitude']

# Extent of the recorded coordinates along each axis.
boundingBox = {
    "lat": {"min": latitudes.min(), "max": latitudes.max()},
    "lon": {"min": longitudes.min(), "max": longitudes.max()},
}

# Width of one bin along each axis, in the units of the coordinate columns.
step = 50  # Adjust as needed

# Bins per axis: the coordinate span divided by the step, truncated to an integer.
lat_span = boundingBox["lat"]["max"] - boundingBox["lat"]["min"]
lon_span = boundingBox["lon"]["max"] - boundingBox["lon"]["min"]
noOfLatEdges = int(lat_span / step)
noOfLonEdges = int(lon_span / step)

# Count the records per cell; the first axis of H is latitude, the second longitude.
H, xedges, yedges = np.histogram2d(latitudes, longitudes, bins=[noOfLatEdges, noOfLonEdges])
binnedData = H.T  # transpose so latitude runs along the image's x-axis

# Render the counts as an image with a colour scale.
image = plt.imshow(
    binnedData,
    extent=[xedges[0], xedges[-1], yedges[0], yedges[-1]],
    origin='lower',
    aspect='auto',
)
plt.colorbar(image, label='Frequency')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('2D Histogram of Latitude and Longitude')
plt.show()
