In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the crime records from the CSV file in the working directory into `df`.
df = pd.read_csv('sample_crimes_data.csv')

In [None]:
# Preview the first five rows to check that the file was parsed as expected.
df.head()         

In [None]:
# Column names, dtypes and non-null counts of the loaded frame.
df.info()        

In [None]:
# Show the column labels as a plain Python list.
print(list(df.columns))

In [None]:
# (number of rows, number of columns) of the loaded frame.
print(df.shape)

In [None]:
# Plain-text dump of the frame (pandas abbreviates long frames when printing).
print (df)

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the numeric columns.
df.describe()    

In [None]:
# Remove rows that are exact duplicates of an earlier row, keeping the first occurrence.
df = df.drop_duplicates(keep='first')

In [None]:
# Report the frame's dimensions, one per line.
n_rows, n_cols = df.shape
print(f"Nombre de lignes : {n_rows}")
print(f"Nombre de colonnes : {n_cols}")

In [None]:
# Reload the raw CSV and re-apply de-duplication: reading the file again would
# otherwise silently discard the duplicates removed by the earlier cell.
df = pd.read_csv('sample_crimes_data.csv').drop_duplicates()

# Cleaning: normalise column labels to snake_case (trim, lower-case, spaces -> '_').
# regex=False makes the space a literal pattern regardless of the pandas version.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)


In [None]:
# Count the missing values in every column.
missing_per_column = df.isna().sum()
print("Données manquantes par colonne :")
print(missing_per_column)

In [None]:
# The five most frequent crime descriptions (value_counts sorts by frequency, descending).
crime_type_counts = df['crm_cd_desc'].value_counts()
print("Top 05 types de crimes :")
print(crime_type_counts.iloc[:5])

In [None]:
# The five areas with the most recorded crimes.
area_counts = df['area_name'].value_counts()
print("Top 05 zones géographiques :")
print(area_counts.iloc[:5])

In [None]:
# Inspect the first five occurrence dates before parsing them.
print(df['date_occ'].iloc[:5])

In [None]:
# Parse the occurrence timestamps with an explicit format instead of letting
# pandas guess it: this is the same month-first, 12-hour layout that the HBase
# preparation cell relies on (e.g. '03/12/2020 12:00:00 AM'), and it avoids
# slow or ambiguous per-value inference. Values that do not match become NaT.
df['date_occ'] = pd.to_datetime(
    df['date_occ'],
    format="%m/%d/%Y %I:%M:%S %p",
    errors='coerce',
)

In [None]:
# Derive calendar year and month columns from the parsed occurrence date.
occurrence = df['date_occ'].dt
df['Year'] = occurrence.year
df['Month'] = occurrence.month

In [None]:
# Bar chart: number of crimes per year of occurrence, years in ascending order.
crimes_by_year = df['Year'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(5, 5))
sns.barplot(x=crimes_by_year.index, y=crimes_by_year.values, ax=ax)
ax.set_title("Crimes par année")
ax.set_xlabel("Année")
ax.set_ylabel("Nbre de crimes")
fig.tight_layout()
plt.show()

In [None]:
# Bar chart: number of crimes per calendar month (all years pooled).
crimes_by_month = df['Month'].value_counts().sort_index()

fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=crimes_by_month.index, y=crimes_by_month.values, ax=ax)
ax.set_title("Crimes par mois")
ax.set_xlabel("Mois")
ax.set_ylabel("Nbre de crimes")
fig.tight_layout()
plt.show()

In [None]:
# Heatmap of crime counts: one row per year, one column per month.
# Each cell counts the non-null 'dr_no' values for that (year, month) pair.
crimes_pivot = df.pivot_table(
    index='Year',
    columns='Month',
    values='dr_no',
    aggfunc='count',
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.heatmap(crimes_pivot, cmap="YlGnBu", annot=True, fmt="g", ax=ax)
ax.set_title("Heatmap des crimes par mois et année")
ax.set_xlabel("Mois")
ax.set_ylabel("Année")
fig.tight_layout()
plt.show()


In [None]:
!pip install happybase
import happybase
connection = happybase.Connection('hbase')
connection.open()


In [None]:
# Start the HBase section from a fresh copy of the raw CSV, dropping exact
# duplicate rows again because the reload discards any earlier de-duplication.
df = pd.read_csv('sample_crimes_data.csv').drop_duplicates()

# Cleaning: normalise column labels to snake_case (trim, lower-case, spaces -> '_').
# regex=False makes the space a literal pattern regardless of the pandas version.
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_', regex=False)


In [None]:
# Display the reloaded frame (the notebook abbreviates long frames).
df

In [None]:
# Build compact YYYYMMDD strings for the occurrence and report dates.
# The raw values are parsed strictly with the month-first, 12-hour format below.
raw_date_format = "%m/%d/%Y %I:%M:%S %p"
for key_column, date_column in (('occurred', 'date_occ'), ('reported', 'date_rptd')):
    parsed_dates = pd.to_datetime(df[date_column], format=raw_date_format)
    df[key_column] = parsed_dates.dt.strftime("%Y%m%d")

In [None]:
# Convert the date columns to datetime using the same explicit format as the
# 'occurred' / 'reported' key columns, so both conversions agree and pandas
# does not have to guess the layout. Values that do not match become NaT.
df["date_occ"] = pd.to_datetime(
    df["date_occ"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
)
df["date_rptd"] = pd.to_datetime(
    df["date_rptd"], format="%m/%d/%Y %I:%M:%S %p", errors="coerce"
)

In [None]:
# Row keys: occurrence date (YYYYMMDD) followed by '_' and the report number.
report_numbers = df['dr_no'].astype(str)
df['rowkey'] = df['occurred'] + "_" + report_numbers

In [None]:
# Mapping of HBase column family -> DataFrame columns stored under that family.
catalog = dict(
    location=[
        'location',
        'cross_street',
        'lat',
        'lon',
        'area_name',
        'premis_desc',
    ],
    crime_info=[
        'crm_cd_desc',
        'weapon_desc',
        'status_desc',
        'part_1-2',
        'vict_age',
        'vict_sex',
        'vict_descent',
        'time_occ',
        'mocodes',
    ],
)

In [None]:
# Imported here so that this cell can be run without relying on an earlier import.
import happybase

# Connect to the HBase host named 'hbase' and open the connection explicitly.
connection = happybase.Connection(host='hbase')
connection.open()

# Handle on the 'practice:crimes' table; presumably the table and its column
# families already exist on the server — TODO confirm.
table = connection.table('practice:crimes')

def ingestion(table, df, column_families=None):
    """Write the rows of ``df`` into an HBase table using a batched writer.

    Every row becomes one ``put`` whose key is the UTF-8 encoded ``rowkey``
    value and whose cells are the non-missing values of the mapped columns,
    stored as UTF-8 strings under ``b"<family>:<column>"`` qualifiers.

    Parameters
    ----------
    table
        Object exposing ``batch(batch_size=...)`` as a context manager whose
        value has a ``put(row, data)`` method (e.g. a happybase table).
    df : pandas.DataFrame
        Must contain a ``rowkey`` column and every column named in the mapping.
    column_families : dict[str, list[str]], optional
        Column family -> list of DataFrame columns. Defaults to the notebook's
        global ``catalog`` so existing two-argument calls keep working.

    Raises
    ------
    KeyError
        If ``df`` lacks ``rowkey`` or one of the mapped columns.

    Notes
    -----
    Rows with a missing row key are skipped (they would otherwise all be
    written under the literal key ``nan`` and overwrite each other), and rows
    with no non-missing mapped value are skipped instead of issuing an empty put.
    """
    if column_families is None:
        column_families = catalog

    mapped_columns = [col for cols in column_families.values() for col in cols]
    missing = [col for col in ['rowkey'] + mapped_columns if col not in df.columns]
    if missing:
        raise KeyError(f"Columns missing from the DataFrame: {missing}")

    # Encode each qualifier once rather than once per row.
    qualifiers = [
        f"{cf}:{col}".encode('utf-8')
        for cf, cols in column_families.items()
        for col in cols
    ]

    with table.batch(batch_size=1000) as batch:
        # Plain tuples (name=None) avoid the per-row Series that iterrows builds
        # and cope with column labels that are not valid identifiers.
        value_rows = df[mapped_columns].itertuples(index=False, name=None)
        for key, values in zip(df['rowkey'], value_rows):
            if pd.isna(key):
                continue

            hbase_data = {
                qualifier: str(value).encode('utf-8')
                for qualifier, value in zip(qualifiers, values)
                if pd.notna(value)
            }
            if not hbase_data:
                continue

            batch.put(str(key).encode('utf-8'), hbase_data)

    print("Inserted!")

In [None]:
# Ingest at most the first 500000 rows of the frame into the HBase table.
rows_to_ingest = df.iloc[:500000]
ingestion(table, rows_to_ingest)

In [None]:
# NOTE(review): this counts the rows of the DataFrame, not the rows stored in HBase.
total_rows = df.shape[0]
print("Lignes totales dans la table des crimes :", total_rows)

In [None]:
# All crimes in Hollywood in 2020 (area name compared case-insensitively).
in_hollywood = df["area_name"].str.upper() == "HOLLYWOOD"
occurred_in_2020 = df["date_occ"].dt.year == 2020
crimes_hollywood = df[in_hollywood & occurred_in_2020]

In [None]:
# Preview the first ten Hollywood crimes of 2020.
crimes_hollywood.head(10)

In [None]:
# Number of Hollywood crimes in 2020. The filtered frame is named
# `crimes_hollywood`; the previous name `crimes_hollywood_2020` was never defined.
print("Crimes à Hollywood en 2020 :", len(crimes_hollywood))

In [None]:
# All SHOPLIFTING and VANDALISM crimes (label contains either word,
# case-insensitively; missing labels never match) that occurred in February 2020.
label_matches = df["crm_cd_desc"].str.contains("SHOPLIFTING|VANDALISM", case=False, na=False)
occurred_on = df["date_occ"].dt
in_february = occurred_on.month == 2
in_2020 = occurred_on.year == 2020

crimes_2020 = df[label_matches & in_february & in_2020]
print("SHOPLIFTING ou VANDALISM en février 2020 :", len(crimes_2020))
print(crimes_2020.head(10))

In [None]:
# Victim age and sex for crimes of INTIMATE PARTNER - SIMPLE ASSAULT (exact match) in April 2020.
# The label is compared after trimming and upper-casing; the previous literal
# ("intimate patner simple assault") was misspelled and lower-case, so it could
# not equal the label this query targets.
victs = df[
    (df["crm_cd_desc"].str.strip().str.upper() == "INTIMATE PARTNER - SIMPLE ASSAULT") &
    (df["date_occ"].dt.month == 4) &
    (df["date_occ"].dt.year == 2020)]

# Keep only the two victim attributes requested by the query.
victim_info = victs[["vict_age", "vict_sex"]]
print("Victimes:", len(victs))
print(victim_info.head(10))

In [None]:
# Crimes reported on 03/12/2020 12:00:00 AM (exact timestamp equality).
report_timestamp = pd.to_datetime("03/12/2020 12:00:00 AM")
reported_then = df["date_rptd"] == report_timestamp
reported = df[reported_then]
print("Crimes reportés le 03/12/2020 :", len(reported))
print(reported.head(10))



In [None]:
# Crimes occurring between 02/01/2020 12:00:00 AM and 02/02/2020 12:00:00 AM
# (both bounds included), in Wilshire, on female victims.
window_start = pd.to_datetime("02/01/2020 12:00:00 AM")
window_end = pd.to_datetime("02/02/2020 12:00:00 AM")

in_window = df["date_occ"].between(window_start, window_end)
in_wilshire = df["area_name"].str.upper() == "WILSHIRE"
female_victim = df["vict_sex"].str.upper() == "F"

filtered_crimes = df[in_window & in_wilshire & female_victim]
print("Crimes sur les femmes entre 01/02/2020 et 02/02/2020 à Wilshire :", len(filtered_crimes))
print(filtered_crimes.head(10))