In [2]:
import pandas as pd

# Read the district-wise cyber-crime CSV into the frame every later cell uses.
data = pd.read_csv(filepath_or_buffer='districtwise-cyber-crimes-2017-onwards.csv')


In [None]:
# Preview the first five rows of the raw frame.
data.head(n=5)

In [None]:
# Per-offence count columns, one per line so additions/removals diff cleanly.
remaining_columns = [
    'tampering_computer_source_documents',
    'ransom_ware',
    'offences_other_than_ransom_ware',
    'dishonestly_recv_stolen_cmp_resrc_or_comm_device',
    'identity_theft',
    'cheating_by_personation_by_using_computer_resource',
    'violation_of_privacy',
    'cyber_terrorism',
    'pub_or_trans_obscene_material_in_electronic_form',
    'pub_or_trans_of_mtrl_cont_sxly_explct_act_in_elect_form',
    'pub_or_trans_matrl_dpctng_chldrn_sxly_explct_elect_form',
    'presrv_and_retention_of_info_by_intermediaries',
    'other_sections_it_act',
    'interception_or_monitoring_or_decryption_of_info',
    'un_athryz_access_atmpt_access_prct_comp_sys',
    'abetment_to_commit_offences',
    'attempt_to_commit_offences',
    'other_sections_of_it_act',
    'abetment_of_suicide_online',
    'cyber_stalking_bullying_of_women_children',
    'data_theft',
    'credit_card_debit_card_fraud',
    'atms_fraud',
    'online_banking_fraud',
    'otp_frauds',
    'other_frauds',
    'cheating',
    'forgery',
    'defamation_morphing',
    'fake_profile',
    'currency_counterfeiting',
    'stamps_counterfeiting',
    'cyber_blackmailing_threatening',
    'fake_news_on_social_media',
    'other_offences',
]

# Keep every row, but only the offence columns listed above.
all_crimes_df = data.loc[:, remaining_columns]

# Show the first few rows of the offence-only frame.
all_crimes_df.head()

In [None]:
# Columns whose relationship to each offence column we want to inspect.
columns_to_correlate = ['year', 'state_name', 'district_name', 'registration_circles']

# Each column is converted to the integer codes pandas assigns to its distinct
# values, then correlated against every offence column.
# NOTE(review): for name-like columns the codes are arbitrary labels, so the
# resulting coefficients should be interpreted with care.
correlation_results = {
    col: all_crimes_df.corrwith(data[col].astype('category').cat.codes)
    for col in columns_to_correlate
}

# Emit one labelled block per column, followed by a blank line.
for col, corr in correlation_results.items():
    print(f"Correlation with {col}:", corr, "", sep="\n")

In [None]:
# Display basic information.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would add a stray "None" line.
data.info()

# Display summary statistics for the numeric columns.
print(data.describe())

In [None]:
# Count the missing entries in every column.
print(data.isna().sum())

# No imputation is applied here. If gaps need handling, fill or drop them in a
# dedicated step (for example, replacing missing values with 0).

In [None]:
# Number of distinct states, then number of distinct districts.
print(len(pd.unique(data['state_name'])))
print(len(pd.unique(data['district_name'])))

In [None]:
import matplotlib.pyplot as plt

# Total offences per year; later cells reuse this Series as the historical data.
yearly_data = data.groupby('year')['total_offences_under_ip'].sum()

# One bar per year on a 10x5 inch canvas.
plt.figure(figsize=(10, 5))
yearly_data.plot.bar()
plt.xlabel('Year')
plt.ylabel('Total Offences')
plt.title('Total Cyber Crimes Over Years')
plt.show()

In [None]:
# Restrict to a single year (2019) and total the offences per state.
data_2019 = data[data['year'] == 2019]

d = pd.DataFrame(data_2019.groupby('state_name')['total_offences_under_ip'].sum())
# Ascending order places the largest total at the top of a horizontal bar chart.
d = d.sort_values(by='total_offences_under_ip', ascending=True)
# Drop states with 5 or fewer offences (presumably to keep the chart readable).
final_data = d[d['total_offences_under_ip'] > 5]

plt.figure(figsize=(12, 6))
plt.barh(final_data.index, final_data['total_offences_under_ip'])
plt.title('Cyber Crimes by State in 2019')
# barh draws the values along x and the categories along y, so the offence
# count labels the x-axis and the state labels the y-axis.
plt.xlabel('Total Offences')
plt.ylabel('State')
plt.show()


In [None]:

import seaborn as sns

# Prepare data for time series analysis: turn the per-year totals into a
# two-column frame with the names the modelling cell expects.
time_series_data = yearly_data.reset_index()
time_series_data.columns = ['Year', 'Total_Offences']
print(time_series_data)

# Scatter of yearly totals to eyeball the trend before fitting a model.
sns.scatterplot(x='Year', y='Total_Offences', data=time_series_data)

Support Vector Regressor (SVR)

In [None]:
import numpy as np
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler

# Feature matrix (one column: the year) and target vector (yearly totals).
X = time_series_data[['Year']].to_numpy()
y = time_series_data['Total_Offences'].to_numpy()

# Standardise inputs and target independently; the target scaler is kept so
# predictions can be mapped back to offence counts afterwards.
scaler_X = StandardScaler()
scaler_y = StandardScaler()
X_scaled = scaler_X.fit_transform(X)
y_scaled = scaler_y.fit_transform(y.reshape(-1, 1)).ravel()

# Fit an RBF-kernel support vector regressor on the standardised series.
model = SVR(kernel='rbf', C=100, gamma=0.1, epsilon=0.1)
model.fit(X_scaled, y_scaled)

# Forecast the eight years 2023 through 2030.
# NOTE(review): these years lie outside the fitted range; an RBF model's
# extrapolations should be sanity-checked before being relied on.
future_years = np.arange(2023, 2031).reshape(-1, 1)
future_years_scaled = scaler_X.transform(future_years)
predictions_scaled = model.predict(future_years_scaled)
predictions = scaler_y.inverse_transform(predictions_scaled.reshape(-1, 1)).ravel()

# Report each forecast rounded to two decimals.
for year, prediction in zip(future_years.ravel(), predictions):
    print(f"Predicted total offences in {year}: {prediction:.2f}")

In [None]:
# Historical totals per year as a two-column frame (year, total).
grouped_data = data.groupby('year', as_index=False)['total_offences_under_ip'].sum()

# Forecast rows, shaped like the historical frame so the two can be stacked.
future_data = pd.DataFrame({
    'year': future_years.ravel(),
    'total_offences_under_ip': predictions,
})

# Stack history on top of forecast with a fresh 0..n-1 index.
combined_data = pd.concat([grouped_data, future_data], ignore_index=True)

# Show the stacked table.
print(combined_data)

In [None]:
# Convert yearly_data to a DataFrame with the same column names as future_data.
yearly_data_df = yearly_data.reset_index()
yearly_data_df.columns = ['year', 'total_offences_under_ip']

# Combine historical and predicted data. ignore_index=True gives the result a
# unique 0..n-1 index; without it both inputs contribute their own 0-based
# labels and the combined frame carries duplicate row labels.
combined_data = pd.concat([yearly_data_df, future_data], ignore_index=True)

plt.figure(figsize=(12, 6))
# Solid blue line for observed totals, dashed orange line for the forecast.
plt.plot(yearly_data_df['year'], yearly_data_df['total_offences_under_ip'], marker='o', label='Historical Data', color='blue')
plt.plot(future_data['year'], future_data['total_offences_under_ip'], marker='o', linestyle='--', label='Predicted Data', color='orange')

plt.title('Total Cyber Crimes Over Years with Predictions')
plt.xlabel('Year')
plt.ylabel('Total Offences')
# One tick per calendar year from 2017 through 2030.
plt.xticks(np.arange(2017, 2031, 1))
plt.legend()
plt.grid()
plt.show()

In [None]:
# Year-over-year change in total offences, expressed in percent; the first row
# has no predecessor and is therefore NaN.
combined_data['pct_change'] = combined_data['total_offences_under_ip'].pct_change().mul(100)

# Show the table including the new column.
print(combined_data)

In [None]:
# Red marks a year-over-year increase, green marks no change or a decrease;
# the first year (no previous value) is treated as zero change.
colors = (
    combined_data['pct_change']
    .fillna(0)
    .gt(0)
    .map({True: 'red', False: 'green'})
    .tolist()
)

plt.figure(figsize=(12, 6))
plt.bar(combined_data['year'], combined_data['total_offences_under_ip'], color=colors)
plt.xlabel('Year')
plt.ylabel('Total Offences')
plt.title('Total Cyber Crimes Over Years with Predictions')
# One tick per calendar year from 2017 through 2030, tilted for legibility.
plt.xticks(np.arange(2017, 2031, 1), rotation=45)
plt.grid(axis='y')
plt.show()