In [2]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Read the diabetes CSV into `df`; the path is relative, so it is resolved
# against the kernel's current working directory.
DATASET_PATH = 'diabetes.csv'
df = pd.read_csv(DATASET_PATH)

In [3]:
# Print the frame's structural summary: index range, per-column non-null
# counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) per numeric column.
# NOTE(review): the output shows a minimum of 0.0 for Glucose, BloodPressure,
# SkinThickness, Insulin and BMI — presumably placeholders for missing
# measurements rather than real readings; confirm against the data source.
df.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
# Preview the first five rows of the frame.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [6]:
# Exploratory overview of `df`, followed by a baseline random-forest
# classifier that predicts the binary 'Outcome' column from all other columns.

# Display basic dataset information
print("First 5 rows of the dataset:\n", df.head())
print("\nDataset Info:\n")
df.info()  # info() prints its report itself and returns None, so it is not wrapped in print()
print("\nSummary Statistics:\n", df.describe())

# Check for missing values (NaN only)
print("\nMissing values in the dataset:\n", df.isnull().sum())

# The NaN check alone is misleading for this data: the summary statistics show
# a minimum of 0 for the columns below, which is not a plausible measurement.
# NOTE(review): these zeros presumably encode "not recorded" — confirm with the
# data source and decide on imputation before trusting the model.
zero_suspect_columns = ["Glucose", "BloodPressure", "SkinThickness", "Insulin", "BMI"]
print(
    "\nZero-valued entries (possible missing-value placeholders):\n",
    (df[zero_suspect_columns] == 0).sum(),
)

# Display the column names
print("\nColumn Names:\n", df.columns)

# Check for duplicate rows
print("\nNumber of duplicate rows:\n", df.duplicated().sum())

# Distribution of the target variable
print("\nTarget Variable Distribution:\n", df['Outcome'].value_counts())

# Splitting features and target variable
X = df.drop(columns='Outcome')  # Features: every column except the target
y = df['Outcome']               # Target variable

# Train-test split (80% train, 20% test).
# The target is imbalanced (mean of 'Outcome' is about 0.35), so stratify on y
# to keep the class ratio the same in both partitions; random_state pins the
# shuffle so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Model: Random Forest Classifier (seeded for reproducible trees)
model = RandomForestClassifier(random_state=42)
model.fit(X_train, y_train)

# Predictions and evaluation on the held-out 20%
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("\nModel Accuracy:", accuracy)
print("\nClassification Report:\n", classification_report(y_test, y_pred))

First 5 rows of the dataset:
    Pregnancies  Glucose  BloodPressure  SkinThickness  Insulin   BMI  \
0            6      148             72             35        0  33.6   
1            1       85             66             29        0  26.6   
2            8      183             64              0        0  23.3   
3            1       89             66             23       94  28.1   
4            0      137             40             35      168  43.1   

   DiabetesPedigreeFunction  Age  Outcome  
0                     0.627   50        1  
1                     0.351   31        0  
2                     0.672   32        1  
3                     0.167   21        0  
4                     2.288   33        1  

Dataset Info:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null

In [7]:
import pandas as pd
import numpy as np

# Sample dataset, written one person per tuple in the order given by `_fields`.
_fields = ("Name", "Age", "Profession", "Salary")
_people = [
    ("Alice", 25, "Engineer", 70000),
    ("Bob", 30, "Doctor", 80000),
    ("Charlie", 35, "Teacher", 50000),
    ("Diana", 40, "Artist", 60000),
    ("Ethan", 28, "Nurse", 55000),
    ("Fiona", 32, "Lawyer", 75000),
    ("George", 29, "Scientist", 72000),
    ("Hannah", 34, "Engineer", 67000),
    ("Ian", 27, "Designer", 52000),
    ("Jane", 31, "Chef", 58000),
]

# Pivot the records into a column-oriented dict: {column name: list of values}.
data = {
    field: [person[pos] for person in _people]
    for pos, field in enumerate(_fields)
}

# Create a DataFrame (this rebinds `df`, replacing whatever it held before).
df = pd.DataFrame(data)

# Last expression of the cell, so the notebook renders the whole frame.
df

Unnamed: 0,Name,Age,Profession,Salary
0,Alice,25,Engineer,70000
1,Bob,30,Doctor,80000
2,Charlie,35,Teacher,50000
3,Diana,40,Artist,60000
4,Ethan,28,Nurse,55000
5,Fiona,32,Lawyer,75000
6,George,29,Scientist,72000
7,Hannah,34,Engineer,67000
8,Ian,27,Designer,52000
9,Jane,31,Chef,58000


In [8]:
def label_age(age):
    """Return "Young" for ages below 30 and "Experienced" otherwise."""
    if age < 30:
        return "Young"
    return "Experienced"


# Derive a categorical column by running the helper on every Age value.
df["Age Category"] = df["Age"].apply(label_age)
print("\nDataset with 'Age Category' column:\n", df)


Dataset with 'Age Category' column:
       Name  Age Profession  Salary Age Category
0    Alice   25   Engineer   70000        Young
1      Bob   30     Doctor   80000  Experienced
2  Charlie   35    Teacher   50000  Experienced
3    Diana   40     Artist   60000  Experienced
4    Ethan   28      Nurse   55000        Young
5    Fiona   32     Lawyer   75000  Experienced
6   George   29  Scientist   72000        Young
7   Hannah   34   Engineer   67000  Experienced
8      Ian   27   Designer   52000        Young
9     Jane   31       Chef   58000  Experienced


In [9]:
# Boolean-mask selection through .loc: keeps only the rows whose Name equals
# "Charlie" and returns them as a DataFrame.
df.loc[df["Name"] == "Charlie"]

Unnamed: 0,Name,Age,Profession,Salary,Age Category
2,Charlie,35,Teacher,50000,Experienced


In [10]:
# The boolean mask on its own: one True/False per row, True where Name is "Charlie".
df["Name"] == "Charlie"

0    False
1    False
2     True
3    False
4    False
5    False
6    False
7    False
8    False
9    False
Name: Name, dtype: bool

In [11]:
# Position-based selection: the row at integer position 2, returned as a Series
# whose index is the column names.
df.iloc[2]

Name                Charlie
Age                      35
Profession          Teacher
Salary                50000
Age Category    Experienced
Name: 2, dtype: object

In [12]:
# Single-column selection by label; returns the Age column as a Series.
df["Age"]

0    25
1    30
2    35
3    40
4    28
5    32
6    29
7    34
8    27
9    31
Name: Age, dtype: int64

In [13]:
# Guided tour of common pandas operations on the sample `df`.
# NOTE: this cell modifies `df` in place (salary increment, column renames), so
# it cannot simply be run a second time: afterwards the "Name" and "Salary"
# columns it starts from no longer exist under those labels. Rebuild `df` first.

### Row and Column Selection ###

# .loc with a boolean mask: rows whose Name is "Charlie"
is_charlie = df["Name"] == "Charlie"
print("\nRow with Name == 'Charlie' using loc:\n", df.loc[is_charlie])

# .iloc is position-based: integer position 2 comes back as a Series
print("\nRow at index 2 using iloc:\n", df.iloc[2])

# A single label yields a Series
print("\n'Age' column:\n", df["Age"])

# A list of labels yields a DataFrame restricted to those columns
wanted_columns = ["Name", "Salary"]
print("\n'Name' and 'Salary' columns:\n", df[wanted_columns])

# .loc slices by label and includes BOTH endpoints (rows labelled 0 through 4)
print("\nAge and Salary of rows 0 to 4 using loc:\n", df.loc[0:4, ["Age", "Salary"]])

# .iloc slices by position and excludes the stop (rows 0-2, columns 0-1)
print("\nFirst 3 rows and first 2 columns using iloc:\n", df.iloc[:3, :2])

### Filtering ###

# One condition
over_30 = df["Age"] > 30
print("\nRows where Age > 30:\n", df[over_30])

# Two conditions combined element-wise with &
above_60k = df["Salary"] > 60000
print("\nRows where Age > 30 and Salary > 60000:\n", df[over_30 & above_60k])

### Applying Functions ###

# Element-wise custom function: ages below 30 are "Young", the rest "Experienced"
df["Age Category"] = df["Age"].apply(
    lambda age: "Young" if age < 30 else "Experienced"
)
print("\nDataset with 'Age Category' column:\n", df)

# Vectorised arithmetic: add a fixed 5000 to every salary
df["Salary"] = df["Salary"].add(5000)
print("\nDataset after incrementing Salary by 5000:\n", df)

### Searching and Changing Values ###

# Targeted assignment: mask picks the row(s), the label picks the column
is_ethan = df["Name"] == "Ethan"
df.loc[is_ethan, "Profession"] = "Data Scientist"
print("\nDataset after changing Ethan's Profession:\n", df)

# Case-insensitive text replacement inside every Profession value
df["Profession"] = df["Profession"].str.replace(
    "engineer",
    "Engineering Specialist",
    case=False,
)
print("\nDataset after replacing 'Engineer':\n", df)

### Renaming Columns ###

# Rename only the columns listed in the mapping (old label -> new label)
new_labels = {"Name": "Full Name", "Salary": "Annual Salary"}
df.rename(columns=new_labels, inplace=True)
print("\nDataset after renaming columns:\n", df)

# Replace every column label with its upper-case form
df.columns = list(map(str.upper, df.columns))
print("\nDataset with all column names in uppercase:\n", df)


Row with Name == 'Charlie' using loc:
       Name  Age Profession  Salary Age Category
2  Charlie   35    Teacher   50000  Experienced

Row at index 2 using iloc:
 Name                Charlie
Age                      35
Profession          Teacher
Salary                50000
Age Category    Experienced
Name: 2, dtype: object

'Age' column:
 0    25
1    30
2    35
3    40
4    28
5    32
6    29
7    34
8    27
9    31
Name: Age, dtype: int64

'Name' and 'Salary' columns:
       Name  Salary
0    Alice   70000
1      Bob   80000
2  Charlie   50000
3    Diana   60000
4    Ethan   55000
5    Fiona   75000
6   George   72000
7   Hannah   67000
8      Ian   52000
9     Jane   58000

Age and Salary of rows 0 to 4 using loc:
    Age  Salary
0   25   70000
1   30   80000
2   35   50000
3   40   60000
4   28   55000

First 3 rows and first 2 columns using iloc:
       Name  Age
0    Alice   25
1      Bob   30
2  Charlie   35

Rows where Age > 30:
       Name  Age Profession  Salary Age Catego