In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Load the Chicago crime records straight from the city's open-data portal.
# The `$limit` query parameter raises the row cap so the whole table is
# returned by a single request.
url = 'https://data.cityofchicago.org/resource/dqcy-ctma.csv?$limit=5000000'
df = pd.read_csv(url)

# --- Inspect the raw frame -------------------------------------------------
# Schema and dtypes (DataFrame.info prints directly, so no print() wrapper).
print("Dataset Information:")
df.info()

print("\nFirst 5 rows of the dataset:")
print(df.head())

# Report (not fix) the null count of every column before any cleaning.
print("\nMissing values before cleaning:")
print(df.isnull().sum())

# --- Clean -----------------------------------------------------------------
# 1. Remove fully duplicated records while all columns are still present.
# 2. Keep only the coordinates and the crime type.
# 3. Discard rows that have no coordinates.
# No datetime parsing is done here: the `date` column is not among the kept
# columns, so converting it would be wasted work.
# NOTE(review): dropna only removes NaN coordinates; placeholder values
# (for example 0.0) would pass through - check the describe() output below.
df = (
    df.drop_duplicates()
    .loc[:, ['longitude', 'latitude', 'primary_type']]
    .dropna(subset=['longitude', 'latitude'])
)

# The ten most frequent crime types among the rows that have coordinates.
# value_counts() ignores NaN, so a missing crime type can never be selected.
top_10_crimes = df['primary_type'].value_counts().nlargest(10).index

# Keep only those crime types and renumber the rows 0..n-1.
df = df[df['primary_type'].isin(top_10_crimes)].reset_index(drop=True)

# Invariant: coordinates were dropna'd and NaN crime types cannot match the
# isin() filter, so the cleaned frame must be free of nulls.
assert not df.isnull().values.any(), "cleaned frame still contains nulls"

# --- Inspect the cleaned frame ---------------------------------------------
print("\nMissing values after cleaning:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

print("\nTop 5 rows after cleaning:")
print(df.head())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 189387 entries, 0 to 189386
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    189387 non-null  int64  
 1   case_number           189387 non-null  object 
 2   date                  189387 non-null  object 
 3   block                 189387 non-null  object 
 4   iucr                  189387 non-null  object 
 5   primary_type          189387 non-null  object 
 6   description           189387 non-null  object 
 7   location_description  188632 non-null  object 
 8   arrest                189387 non-null  bool   
 9   domestic              189387 non-null  bool   
 10  beat                  189387 non-null  int64  
 11  district              189387 non-null  int64  
 12  ward                  189387 non-null  int64  
 13  community_area        189387 non-null  int64  
 14  fbi_code              189387 no

In [2]:
# Importing necessary libraries
import pandas as pd
import numpy as np

# Pull the Seattle offense records from the city's open-data portal; the
# `$limit` query parameter raises the row cap for the single request.
url = 'https://data.seattle.gov/resource/tazs-3rd5.csv?$limit=5000000'
df = pd.read_csv(url)

# --- Raw-data overview -----------------------------------------------------
print("Dataset Information:")
df.info()

print("\nFirst 5 rows of the dataset:")
print(df.head())

# Null count per column, reported before anything is removed.
print("\nMissing values before cleaning:")
print(df.isnull().sum())

# --- Cleaning pipeline -----------------------------------------------------
# Steps, in order:
#   * drop fully duplicated records;
#   * parse both timestamp columns, turning unparseable values into NaT;
#   * discard rows whose report or offense-start timestamp is missing/NaT;
#   * keep only the coordinates and the offense label;
#   * discard rows without coordinates.
# The timestamp columns are used only as a row filter; they are not kept.
# NOTE(review): dropna only removes NaN coordinates; placeholder values
# (for example 0.0) would pass through - check the describe() output below.
df = (
    df.drop_duplicates()
    .assign(
        report_datetime=lambda frame: pd.to_datetime(
            frame['report_datetime'], errors='coerce'
        ),
        offense_start_datetime=lambda frame: pd.to_datetime(
            frame['offense_start_datetime'], errors='coerce'
        ),
    )
    .dropna(subset=['report_datetime', 'offense_start_datetime'])
    .loc[:, ['longitude', 'latitude', 'offense']]
    .dropna(subset=['longitude', 'latitude'])
)

# Ten most frequent offense labels among the remaining rows.
top_10_crimes = df['offense'].value_counts().nlargest(10).index

# Restrict to those offenses and renumber the rows from zero.
is_top_offense = df['offense'].isin(top_10_crimes)
df = df[is_top_offense].reset_index(drop=True)

# --- Cleaned-data overview -------------------------------------------------
print("\nMissing values after cleaning:")
print(df.isnull().sum())

print("\nSummary statistics:")
print(df.describe())

print("\nTop 5 rows after cleaning:")
print(df.head())


Dataset Information:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1146937 entries, 0 to 1146936
Data columns (total 17 columns):
 #   Column                  Non-Null Count    Dtype  
---  ------                  --------------    -----  
 0   report_number           1146937 non-null  object 
 1   offense_id              1146937 non-null  int64  
 2   offense_start_datetime  1145059 non-null  object 
 3   offense_end_datetime    661211 non-null   object 
 4   report_datetime         1146937 non-null  object 
 5   group_a_b               1146937 non-null  object 
 6   crime_against_category  1146937 non-null  object 
 7   offense_parent_group    1146937 non-null  object 
 8   offense                 1146937 non-null  object 
 9   offense_code            1146937 non-null  object 
 10  precinct                1146927 non-null  object 
 11  sector                  1146927 non-null  object 
 12  beat                    1146927 non-null  object 
 13  mcpp                    1146907 non-