# Pandas Basics

In [3]:
import pandas as pd
import numpy as np
# creating dataframe from numpy arrays
names = np.array(["Aroosa","Maria","Noor"]) # numpy array having names
ages = np.random.randint(20,40, size = 3)   # ages are from 20 to 30 and 3 ages are selected randomly
salaries = np.linspace(30000,60000,3)       # salaries are from 30000 to 60000 and 3 salaries are selected evenly between two numbers

df = pd.DataFrame({"Name":names,
              "Age":ages,
              "Salary":salaries})
df

Unnamed: 0,Name,Age,Salary
0,Aroosa,22,30000.0
1,Maria,30,45000.0
2,Noor,31,60000.0


In [4]:
# here tested.csv file is being loaded and read
df = pd.read_csv("tested.csv")
df.head() # the first five rows are showed by default. we can select how many we want to show as well

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,0,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,1,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,0,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,0,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,1,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [7]:
# basic exploration
print(df.shape)   #Rows and columns
print(df.info())  # non null counts and data types
df.describe()     # summary statistics

(418, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          332 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,418.0,418.0,418.0,332.0,418.0,418.0,417.0
mean,1100.5,0.363636,2.26555,30.27259,0.447368,0.392344,35.627188
std,120.810458,0.481622,0.841838,14.181209,0.89676,0.981429,55.907576
min,892.0,0.0,1.0,0.17,0.0,0.0,0.0
25%,996.25,0.0,1.0,21.0,0.0,0.0,7.8958
50%,1100.5,0.0,3.0,27.0,0.0,0.0,14.4542
75%,1204.75,1.0,3.0,39.0,1.0,0.0,31.5
max,1309.0,1.0,3.0,76.0,8.0,9.0,512.3292


In [11]:
# handling missing values
df.isnull().sum()       # counting the sum of all the nulls in every column
df['Age'].fillna(df['Age'].mean(), inplace = True) # filling all the null ages values with the mean of age(inplace= True: modifies the dataframe directly without creating a new one)
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Age'].fillna(df['Age'].mean(), inplace = True)


In [13]:
# dropping rows with missing values
df.dropna()
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  418 non-null    int64  
 1   Survived     418 non-null    int64  
 2   Pclass       418 non-null    int64  
 3   Name         418 non-null    object 
 4   Sex          418 non-null    object 
 5   Age          418 non-null    float64
 6   SibSp        418 non-null    int64  
 7   Parch        418 non-null    int64  
 8   Ticket       418 non-null    object 
 9   Fare         417 non-null    float64
 10  Cabin        91 non-null     object 
 11  Embarked     418 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 39.3+ KB
None


In [14]:
# dropping the columns we don't need
df.drop(columns=["Cabin","Ticket","Name","Embarked","PassengerId","SibSp"],inplace=True) #dropped multiple columns
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418 entries, 0 to 417
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Survived  418 non-null    int64  
 1   Pclass    418 non-null    int64  
 2   Sex       418 non-null    object 
 3   Age       418 non-null    float64
 4   Parch     418 non-null    int64  
 5   Fare      417 non-null    float64
dtypes: float64(2), int64(3), object(1)
memory usage: 19.7+ KB
None


In [15]:
print(df.head())

   Survived  Pclass     Sex   Age  Parch     Fare
0         0       3    male  34.5      0   7.8292
1         1       3  female  47.0      0   7.0000
2         0       2    male  62.0      0   9.6875
3         0       3    male  27.0      0   8.6625
4         1       3  female  22.0      1  12.2875


In [16]:
X = df.iloc[:,1:] # all rows and all columns as well except the first column
Y = df['Survived'] # this is the first columns

In [17]:
X

Unnamed: 0,Pclass,Sex,Age,Parch,Fare
0,3,male,34.50000,0,7.8292
1,3,female,47.00000,0,7.0000
2,2,male,62.00000,0,9.6875
3,3,male,27.00000,0,8.6625
4,3,female,22.00000,1,12.2875
...,...,...,...,...,...
413,3,male,30.27259,0,8.0500
414,1,female,39.00000,0,108.9000
415,3,male,38.50000,0,7.2500
416,3,male,30.27259,0,8.0500


In [18]:
Y

Unnamed: 0,Survived
0,0
1,1
2,0
3,0
4,1
...,...
413,0
414,1
415,0
416,0
