In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Read the training CSV (resolved relative to the notebook's working directory)
# into the DataFrame that every later cell operates on.
TRAIN_CSV = "train.csv"
df = pd.read_csv(TRAIN_CSV)

In [3]:
# Summarise the frame: index range, column names, non-null counts, dtypes and
# memory usage. DataFrame.info() writes its report straight to stdout, which is
# why it is called as a plain statement instead of being wrapped in print().
print("Dataset Info:")
df.info()

Dataset Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [4]:
# Report the (rows, columns) dimensions of the loaded frame.
dataset_shape = df.shape
print("\nShape of the dataset:", dataset_shape)


Shape of the dataset: (891, 12)


In [5]:
# Count the null entries in every column so we can decide how to treat them.
missing_per_column = df.isna().sum()
print("\nMissing values in each column:", missing_per_column, sep="\n")


Missing values in each column:
PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64


In [6]:
# Descriptive statistics for all columns; include='all' adds the
# count/unique/top/freq rows for the non-numeric columns as well.
summary_stats = df.describe(include='all')
print("\nSummary statistics:", summary_stats, sep="\n")


Summary statistics:
        PassengerId    Survived      Pclass                     Name   Sex  \
count    891.000000  891.000000  891.000000                      891   891   
unique          NaN         NaN         NaN                      891     2   
top             NaN         NaN         NaN  Braund, Mr. Owen Harris  male   
freq            NaN         NaN         NaN                        1   577   
mean     446.000000    0.383838    2.308642                      NaN   NaN   
std      257.353842    0.486592    0.836071                      NaN   NaN   
min        1.000000    0.000000    1.000000                      NaN   NaN   
25%      223.500000    0.000000    2.000000                      NaN   NaN   
50%      446.000000    0.000000    3.000000                      NaN   NaN   
75%      668.500000    1.000000    3.000000                      NaN   NaN   
max      891.000000    1.000000    3.000000                      NaN   NaN   

               Age       SibSp       Parch

In [7]:
# Replace the missing Age entries with a fixed placeholder value.
# NOTE(review): 20 is a hand-picked constant; a data-driven statistic such as
# the column median may be a better choice - confirm the intent.
AGE_FILL_VALUE = 20
df['Age'] = df['Age'].fillna(AGE_FILL_VALUE)

In [8]:
# Drop 'Cabin' due to excessive missing values (687 of 891 rows are null in
# the missing-value check).
# errors='ignore' keeps this cell re-runnable: without it, executing the cell a
# second time raises KeyError because 'Cabin' has already been removed.
df = df.drop(columns=['Cabin'], errors='ignore')

In [9]:
# One-hot encode the port of embarkation.
# 'Embarked' still contains missing values at this point (2 rows in the
# missing-value check). get_dummies encodes NaN as an all-zero row, and with
# drop_first=True an all-zero row is exactly how the dropped baseline category
# is represented, so the missing rows would silently be treated as that
# category. Impute them with the most frequent port before encoding.
# The membership guard makes the cell safe to re-run after the column has
# already been replaced by its dummy columns.
if 'Embarked' in df.columns:
    most_common_port = df['Embarked'].mode()[0]
    df['Embarked'] = df['Embarked'].fillna(most_common_port)
    df = pd.get_dummies(df, columns=['Embarked'], drop_first=True)  # One-hot encoding

In [10]:
# Show the dtype of every column as the frame stands at this point.
current_dtypes = df.dtypes
print("\nOriginal Data Types:", current_dtypes, sep="\n")


Original Data Types:
PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked_Q       uint8
Embarked_S       uint8
dtype: object


In [11]:
# One-hot encode the remaining categorical features. drop_first=True removes
# the first level of each feature so the indicator columns are not redundant.
categorical_features = ['Sex', 'Pclass']
df = pd.get_dummies(df, columns=categorical_features, drop_first=True)

In [12]:
# Rescale the continuous features to the [0, 1] range. The scaler is fitted on
# the same rows it transforms and is kept in `scaler` for later reuse.
numeric_features = ['Age', 'Fare']
scaler = MinMaxScaler()
scaled_values = scaler.fit_transform(df[numeric_features])
df[numeric_features] = scaled_values

In [13]:
# Re-inspect the column dtypes after encoding and scaling.
dtypes_after_encoding = df.dtypes
print(dtypes_after_encoding)

PassengerId      int64
Survived         int64
Name            object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Embarked_Q       uint8
Embarked_S       uint8
Sex_male         uint8
Pclass_2         uint8
Pclass_3         uint8
dtype: object


In [14]:
# Convert the free-text columns from generic `object` to pandas' dedicated
# `string` dtype.
for text_col in ('Name', 'Ticket'):
    df[text_col] = df[text_col].astype('string')


In [15]:
# Final dtype check: Name and Ticket should now be reported as `string`.
# Left as a bare expression so the notebook displays it as the cell's output.
df.dtypes


PassengerId      int64
Survived         int64
Name            string
Age            float64
SibSp            int64
Parch            int64
Ticket          string
Fare           float64
Embarked_Q       uint8
Embarked_S       uint8
Sex_male         uint8
Pclass_2         uint8
Pclass_3         uint8
dtype: object