## **Data Gathering and Exploration**

In [3]:
from google.colab import files
files.upload()


Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"cecileeymond","key":"f1d173eeb5ac2816b48cddc525f2b5d3"}'}

In [4]:
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [15]:
!pip install kaggle



In [5]:
!kaggle competitions download -c titanic
!unzip titanic.zip

Downloading titanic.zip to /content
  0% 0.00/34.1k [00:00<?, ?B/s]
100% 34.1k/34.1k [00:00<00:00, 129MB/s]
Archive:  titanic.zip
  inflating: gender_submission.csv   
  inflating: test.csv                
  inflating: train.csv               


# **Exploring the data**

In [6]:
import pandas as pd

# Load the Titanic dataset
titanic_data = pd.read_csv('train.csv')

# Explore the dataset
print(titanic_data.head())
print('\n')
titanic_data.info()

   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex   Age  SibSp  \
0                            Braund, Mr. Owen Harris    male  22.0      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female  38.0      1   
2                             Heikkinen, Miss. Laina  female  26.0      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female  35.0      1   
4                           Allen, Mr. William Henry    male  35.0      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  




In [14]:
# Remove duplicate rows
titanic_data = titanic_data.drop_duplicates()

# Verify duplicates are removed
print(titanic_data.duplicated().sum())

# Remove irrelevant columns (e.g., 'Cabin' and 'Ticket')
# code qui ne marche pas, error pour identifier cabine et ticket sur axis 1
# Remove irrelevant columns (e.g., 'Cabin' and 'Ticket')
# titanic_data = titanic_data.drop(['Cabin', 'Ticket'], axis=1)
## solution Gemini:
columns_to_drop = ['Cabin', 'Ticket']
titanic_data = titanic_data.drop(columns=[col for col in columns_to_drop if col in titanic_data.columns])

titanic_data.info()

0
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Sex             891 non-null    object 
 5   Age             714 non-null    float64
 6   SibSp           891 non-null    int64  
 7   Parch           891 non-null    int64  
 8   Fare            891 non-null    float64
 9   Embarked        889 non-null    object 
 10  Age_normalized  714 non-null    float64
dtypes: float64(3), int64(5), object(3)
memory usage: 76.7+ KB


In [15]:
# Fixing Structural Errors
# Structural errors include data format issue: converting dates
# Convert 'Date' column to datetime format
titanic_data['Date'] = pd.to_datetime(titanic_data['Date']) # ex: pas de colonne date dans le fichier titanic
titanic_data.head()

KeyError: 'Date'

# data **normalization**

In [8]:
# how to perform min-max normalization on a numerical column, e.g., Age:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
titanic_data['Age_normalized'] = scaler.fit_transform(titanic_data[['Age']]) # une colonne en plus

titanic_data.info()
titanic_data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   PassengerId     891 non-null    int64  
 1   Survived        891 non-null    int64  
 2   Pclass          891 non-null    int64  
 3   Name            891 non-null    object 
 4   Sex             891 non-null    object 
 5   Age             714 non-null    float64
 6   SibSp           891 non-null    int64  
 7   Parch           891 non-null    int64  
 8   Ticket          891 non-null    object 
 9   Fare            891 non-null    float64
 10  Cabin           204 non-null    object 
 11  Embarked        889 non-null    object 
 12  Age_normalized  714 non-null    float64
dtypes: float64(3), int64(5), object(5)
memory usage: 90.6+ KB


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Age_normalized
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S,0.271174
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C,0.472229
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S,0.321438
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S,0.434531
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S,0.434531


# **Data reduction**

Data reduction involves reducing the volume but producing the same or similar analytical results. It helps in handling large datasets efficiently.

Principal Component Analysis (PCA)



In [9]:
from sklearn.decomposition import PCA
#This line creates an instance of the PCA class with the parameter n_components set to 2.
# It specifies that we want to reduce the dimensionality of the data to 2 principal components.
# In other words, the data will be projected into a 2D space.

#!! uniquement sur type de variables

pca = PCA(n_components=2)
# reduced_data = pca.fit_transform(original_data)
reduced_data = pca.fit_transform(titanic_data)

ValueError: could not convert string to float: 'Braund, Mr. Owen Harris'