In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go

# Pandas display configuration: never truncate columns when rendering a frame,
# and cap rendered output at 50 rows.
pd.options.display.max_columns = None
pd.options.display.max_rows = 50

In [2]:
# Load the sample dataset into `df`, the frame every later cell works on.
# NOTE(review): this is a hard-coded absolute path (presumably a Colab runtime
# location) — adjust it when running the notebook anywhere else.
csv_path = "/content/pancreatic_cancer_prediction_sample.csv"
df = pd.read_csv(csv_path)

In [20]:
# Preview the first rows (head() defaults to 5) as a quick sanity check of the load.
df.head()

Unnamed: 0,Country,Age,Gender,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,Abdominal_Discomfort,Back_Pain,Weight_Loss,Development_of_Type2_Diabetes,Stage_at_Diagnosis,Survival_Time_Months,Treatment_Type,Survival_Status,Alcohol_Consumption,Physical_Activity_Level,Diet_Processed_Food,Access_to_Healthcare,Urban_vs_Rural,Economic_Status
0,Canada,64,Female,0,0,0,0,0,0,0,0,0,0,0,Stage III,13,Surgery,0,0,Medium,Low,High,Urban,Low
1,South Africa,77,Male,1,1,0,0,0,0,0,0,0,0,1,Stage III,13,Chemotherapy,0,1,Medium,Medium,Medium,Urban,Low
2,India,71,Female,0,0,0,0,0,0,0,0,0,1,1,Stage IV,3,Chemotherapy,1,0,Medium,High,Low,Rural,Middle
3,Germany,56,Male,0,0,0,0,1,0,1,0,0,0,1,Stage IV,6,Radiation,0,1,Low,Low,Medium,Rural,Middle
4,United States,82,Female,0,0,0,0,1,0,0,0,0,0,0,Stage IV,9,Chemotherapy,1,0,Low,Medium,Medium,Rural,Low


In [4]:
# Print the frame summary: number of entries, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 24 columns):
 #   Column                         Non-Null Count  Dtype 
---  ------                         --------------  ----- 
 0   Country                        50000 non-null  object
 1   Age                            50000 non-null  int64 
 2   Gender                         50000 non-null  object
 3   Smoking_History                50000 non-null  int64 
 4   Obesity                        50000 non-null  int64 
 5   Diabetes                       50000 non-null  int64 
 6   Chronic_Pancreatitis           50000 non-null  int64 
 7   Family_History                 50000 non-null  int64 
 8   Hereditary_Condition           50000 non-null  int64 
 9   Jaundice                       50000 non-null  int64 
 10  Abdominal_Discomfort           50000 non-null  int64 
 11  Back_Pain                      50000 non-null  int64 
 12  Weight_Loss                    50000 non-null  int64 
 13  D

In [5]:
# Count the missing entries in each column and print the per-column totals.
missing_per_column = df.isna().sum()
print("\nMissing Values in the Dataset:")
print(missing_per_column)


Missing Values in the Dataset:
Country                          0
Age                              0
Gender                           0
Smoking_History                  0
Obesity                          0
Diabetes                         0
Chronic_Pancreatitis             0
Family_History                   0
Hereditary_Condition             0
Jaundice                         0
Abdominal_Discomfort             0
Back_Pain                        0
Weight_Loss                      0
Development_of_Type2_Diabetes    0
Stage_at_Diagnosis               0
Survival_Time_Months             0
Treatment_Type                   0
Survival_Status                  0
Alcohol_Consumption              0
Physical_Activity_Level          0
Diet_Processed_Food              0
Access_to_Healthcare             0
Urban_vs_Rural                   0
Economic_Status                  0
dtype: int64


## Statistical Summary

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,Age,Smoking_History,Obesity,Diabetes,Chronic_Pancreatitis,Family_History,Hereditary_Condition,Jaundice,Abdominal_Discomfort,Back_Pain,Weight_Loss,Development_of_Type2_Diabetes,Survival_Time_Months,Survival_Status,Alcohol_Consumption
count,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0,50000.0
mean,64.54094,0.29954,0.24826,0.19998,0.0993,0.15168,0.04944,0.19922,0.2965,0.25286,0.34998,0.19622,13.89804,0.12844,0.30346
std,9.973847,0.458061,0.432008,0.399989,0.299067,0.358714,0.216787,0.399418,0.456719,0.434656,0.476968,0.397141,11.272151,0.334582,0.459757
min,30.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
25%,58.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0
50%,65.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
75%,71.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,19.0,0.0,1.0
max,90.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,59.0,1.0,1.0


In [7]:
# List the distinct values found in every object-dtype (categorical) column.
print("\nUnique Values in Categorical Columns:")
object_frame = df.select_dtypes(include="object")
for col in object_frame.columns:
    distinct_values = df[col].unique()
    print(f"{col}:{distinct_values}")


Unique Values in Categorical Columns:
Country:['Canada' 'South Africa' 'India' 'Germany' 'United States' 'Australia'
 'China' 'United Kingdom' 'Brazil']
Gender:['Female' 'Male']
Stage_at_Diagnosis:['Stage III' 'Stage IV' 'Stage II' 'Stage I']
Treatment_Type:['Surgery' 'Chemotherapy' 'Radiation']
Physical_Activity_Level:['Medium' 'Low' 'High']
Diet_Processed_Food:['Low' 'Medium' 'High']
Access_to_Healthcare:['High' 'Medium' 'Low']
Urban_vs_Rural:['Urban' 'Rural']
Economic_Status:['Low' 'Middle' 'High']


## Data Visualization

In [9]:
# Draw one interactive histogram per int64/float64 column.
# `numerical_cols` is kept as a module-level name because the boxplot cell reuses it.
numeric_frame = df.select_dtypes(include=["int64", "float64"])
numerical_cols = numeric_frame.columns
for col in numerical_cols:
    fig = px.histogram(
        df,
        x=col,
        nbins=20,
        title=f"Distribution of {col}",
    )
    fig.show()

In [14]:
# Plot count plots for the categorical (object-dtype) columns using Plotly.
categorical_cols = df.select_dtypes(include='object').columns

for col in categorical_cols:
    # Build an explicit two-column frame (category, Count) rather than handing
    # px.bar a bare Series. A Series triggers Plotly's wide-form mode, which adds
    # a "variable" legend and generic hover labels, and whose trace label depends
    # on how the installed pandas version names the value_counts() result.
    # rename_axis + reset_index(name=...) pins both column names explicitly.
    counts = df[col].value_counts().rename_axis(col).reset_index(name="Count")
    fig = px.bar(counts, x=col, y="Count", title=f"Count Plot of {col}")
    fig.update_layout(xaxis_title=col, yaxis_title="Count")
    fig.show()

In [25]:
# Number of dataset rows per country, as a two-column frame (Country, Count).
rows_per_country = df.groupby("Country").size()
country_data = rows_per_country.reset_index(name="Count")

# Interactive choropleth: countries are matched by name and shaded by Count.
choropleth_options = dict(
    locations="Country",
    locationmode="country names",
    color="Count",
    title="Pancreatic Cancer Cases by Country",
    color_continuous_scale="picnic",
)
fig = px.choropleth(country_data, **choropleth_options)
fig.show()

In [17]:
# Plot boxplots for the non-binary numerical columns using Plotly.
# Columns with at most two distinct values (the 0/1 indicator flags) are skipped:
# a box plot of such a column is degenerate, and for a rarely-set flag every 1 is
# drawn as an "outlier", which says nothing about the distribution.
# `numerical_cols` comes from the histogram cell and must already be defined.
boxplot_cols = [name for name in numerical_cols if df[name].nunique() > 2]
for col in boxplot_cols:
    fig = px.box(df, y=col, title=f"Boxplot of {col}")
    fig.show()

In [18]:
# Example insights: a hand-written summary stored as a string and printed verbatim.
# NOTE(review): nothing below is computed from `df`. In the cells visible here
# there is no correlation analysis, the choropleth shades countries by row count
# rather than by an incidence rate, and the missing-value check reported 0 for
# every column — confirm each statement against actual results or reword it.
insights = """
Key Insights:
1. The dataset contains records from multiple countries, with varying distributions of age, gender, and risk factors.
2. Correlation analysis reveals that certain risk factors (e.g., Smoking_History, Obesity) are strongly associated with pancreatic cancer stages.
3. Geospatial analysis highlights regions with higher incidence rates, suggesting potential environmental or lifestyle factors.
4. Outliers in survival time and age may warrant further investigation.

Recommendations:
1. Focus on high-risk populations (e.g., smokers, obese individuals) for targeted prevention programs.
2. Conduct further research into environmental factors in regions with high incidence rates.
3. Address missing data systematically to improve dataset quality.
"""
print(insights)


Key Insights:
1. The dataset contains records from multiple countries, with varying distributions of age, gender, and risk factors.
2. Correlation analysis reveals that certain risk factors (e.g., Smoking_History, Obesity) are strongly associated with pancreatic cancer stages.
3. Geospatial analysis highlights regions with higher incidence rates, suggesting potential environmental or lifestyle factors.
4. Outliers in survival time and age may warrant further investigation.

Recommendations:
1. Focus on high-risk populations (e.g., smokers, obese individuals) for targeted prevention programs.
2. Conduct further research into environmental factors in regions with high incidence rates.
3. Address missing data systematically to improve dataset quality.

