In [0]:
#!pip install -r requirements.txt
#!pip install datapackage

# Introduction to Pandas DataFrame and Series

## Goal:

_**In this Intro Tutorial You Will Learn about**_

1. Series.
2. DataFrame.
3. Slicing.
4. Indexing.
5. Add columns/rows.
6. Dropping columns/rows.
7. Change specific values.
8. Select a specific subset.
9. Filtering and comparing with scalar values.
10. Check for null values.
11. Create a dataset from a dictionary.
12. Assign a date-range index to a DataFrame.
13. Join/ Concatenate dataframes. 
14. Select Categorical subset or Numerical, or time columns.
15. Get statistical summary about dataset.
16. Grouping and dropping duplicates.
17. Save data to csv or any format - errors handling.
18. Read data from a csv file or any format.
19. Remove undesired columns based on their names.
20. Handling missing values for a small dataset.

In [20]:
# ignore warnings from pandas
import warnings
warnings.filterwarnings('ignore')

In [21]:
# import the libraries
import numpy as np
import pandas as pd
from pandas import Series, DataFrame
import re
from datetime import date, datetime

# Series, DataFrames

In [22]:
# Build a Series holding the first six multiples of 10, labelled 'row 1' .. 'row 6'
series_labels = [f'row {i}' for i in range(1, 7)]
series1 = Series(np.arange(6) * 10, index=series_labels)
series1

row 1     0
row 2    10
row 3    20
row 4    30
row 5    40
row 6    50
dtype: int32

In [23]:
# Select by integer position ("row number") with .iloc.
# Plain `series1[2]` is intentionally not used: this Series has string labels,
# so an integer key only works through a positional fallback that pandas has
# deprecated. .iloc is the explicit, supported way to index by position.
series1.iloc[2]

20

In [24]:
# Select by index label ("row name").
# .loc[...] and plain [...] with a label return the same element here;
# only the last expression of the cell is displayed.
series1.loc['row 4'] 
series1['row 4']

30

In [25]:
# Create DataFrames
df = pd.DataFrame({'A': list('abf..'), 'B': list('bed..')}, dtype="category")
df

Unnamed: 0,A,B
0,a,b
1,b,e
2,f,d
3,.,.
4,.,.


In [26]:
df = df.replace('.', np.nan)
df

Unnamed: 0,A,B
0,a,b
1,b,e
2,f,d
3,,
4,,


In [27]:
# Daterange
print(pd.date_range(start='04-01-2020', end='04-10-2020', freq='D'))
print(pd.date_range(start='04-01-2020', periods=10, freq='D'))

DatetimeIndex(['2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04',
               '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08',
               '2020-04-09', '2020-04-10'],
              dtype='datetime64[ns]', freq='D')
DatetimeIndex(['2020-04-01', '2020-04-02', '2020-04-03', '2020-04-04',
               '2020-04-05', '2020-04-06', '2020-04-07', '2020-04-08',
               '2020-04-09', '2020-04-10'],
              dtype='datetime64[ns]', freq='D')


In [28]:
df = pd.DataFrame({'column 1': range(5), 'column 2': range(5)}, 
                  index = pd.date_range( end='04-01-2020', periods=5, freq='D'),
                  dtype="int64")
df

Unnamed: 0,column 1,column 2
2020-03-28,0,0
2020-03-29,1,1
2020-03-30,2,2
2020-03-31,3,3
2020-04-01,4,4


In [29]:
# Data Frames - tables
# Seed NumPy's global generator so the random values are the same on every run
np.random.seed(10)
row_names = [f'row {i}' for i in range(1, 9)]
column_names = [f'column {i}' for i in range(1, 9)]
# 64 uniform draws in [0, 1), scaled to [0, 10) and arranged as an 8 x 8 grid
random_values = 10 * np.random.rand(64).reshape(8, 8)
df_obj = pd.DataFrame(random_values, index=row_names, columns=column_names)
df_obj

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8
row 1,7.713206,0.207519,6.336482,7.488039,4.98507,2.247966,1.980629,7.605307
row 2,1.691108,0.883398,6.853598,9.533933,0.039483,5.121923,8.12621,6.125261
row 3,7.217553,2.918761,9.177741,7.145758,5.425444,1.4217,3.733408,6.741336
row 4,4.418332,4.34014,6.17767,5.131382,6.503972,6.01039,8.052232,5.216472
row 5,9.086489,3.192361,0.904593,3.007001,1.139844,8.286813,0.468963,6.262871
row 6,5.475862,8.19287,1.989475,8.568503,3.516526,7.546477,2.959617,8.839365
row 7,3.255116,1.650159,3.925292,0.934604,8.211057,1.51152,3.841144,9.442607
row 8,9.876255,4.563045,8.261228,2.513741,5.973716,9.028318,5.345579,5.902014


In [30]:
df_obj.describe()

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8
count,8.0,8.0,8.0,8.0,8.0,8.0,8.0,8.0
mean,6.09174,3.243532,5.45326,5.54037,4.474389,5.146888,4.313473,7.016904
std,2.873009,2.531915,2.9288,3.127016,2.756817,3.092054,2.730658,1.48688
min,1.691108,0.207519,0.904593,0.934604,0.039483,1.4217,0.468963,5.216472
25%,4.127528,1.458469,3.441338,2.883686,2.922356,2.063855,2.71487,6.069449
50%,6.346707,3.055561,6.257076,6.13857,5.205257,5.566156,3.787276,6.502104
75%,8.056527,4.395866,7.205506,7.758155,6.10628,7.731561,6.022243,7.913822
max,9.876255,8.19287,9.177741,9.533933,8.211057,9.028318,8.12621,9.442607


In [31]:
df_obj.dtypes

column 1    float64
column 2    float64
column 3    float64
column 4    float64
column 5    float64
column 6    float64
column 7    float64
column 8    float64
dtype: object

In [32]:
# Obtain a small subset from the entire dataframe
# Slicing 
df_obj.loc[['row 4', 'row 8'], ['column 4', 'column 8']]

Unnamed: 0,column 4,column 8
row 4,5.131382,5.216472
row 8,2.513741,5.902014


In [33]:
# Comparing with scalars: the comparison is applied element-wise and
# yields a boolean DataFrame with the same rows and columns
df_obj > 7

Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8
row 1,True,False,False,True,False,False,False,True
row 2,False,False,False,True,False,False,True,False
row 3,True,False,True,True,False,False,False,False
row 4,False,False,False,False,False,False,True,False
row 5,True,False,False,False,False,True,False,False
row 6,False,True,False,True,False,True,False,True
row 7,False,False,False,False,True,False,False,True
row 8,True,False,True,False,False,True,False,False


In [34]:
# Filtering
df_obj[df_obj['column 4'] > 7]


Unnamed: 0,column 1,column 2,column 3,column 4,column 5,column 6,column 7,column 8
row 1,7.713206,0.207519,6.336482,7.488039,4.98507,2.247966,1.980629,7.605307
row 2,1.691108,0.883398,6.853598,9.533933,0.039483,5.121923,8.12621,6.125261
row 3,7.217553,2.918761,9.177741,7.145758,5.425444,1.4217,3.733408,6.741336
row 6,5.475862,8.19287,1.989475,8.568503,3.516526,7.546477,2.959617,8.839365


### Create a Dataset from a Dictionary

In [35]:
# Students data, one list per column (np.nan marks a missing entry)
data = {
    'Name': ['Adnan Masuood', 'Alla Abdella', 'Sam Tony', 'Nora Ahmed', 'John Cena', 'Sam Tony'],
    'Height': [np.nan, 5.4, 5.1, 5.5, 6, 5.1],
    'Weight': [160, 150, 170, 158, 167, 148],
    'Degree': ['PhD', 'Msc', 'BSc', 'BSc', np.nan, 'BSc'],
}

# Build the DataFrame on a daily date index, then label that index 'Joined_Date'
joined_dates = pd.date_range(start='04-01-2020', periods=6, freq='D')
df = pd.DataFrame(data, index=joined_dates)
df.index.name = 'Joined_Date'
df

Unnamed: 0_level_0,Name,Height,Weight,Degree
Joined_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2020-04-01,Adnan Masuood,,160,PhD
2020-04-02,Alla Abdella,5.4,150,Msc
2020-04-03,Sam Tony,5.1,170,BSc
2020-04-04,Nora Ahmed,5.5,158,BSc
2020-04-05,John Cena,6.0,167,
2020-04-06,Sam Tony,5.1,148,BSc


In [36]:
# Declare a list  
city= ['Tampa', np.nan, 'Orlando', np.nan, 'Tampa', 'San Francisco'] 
  
# Add city to dataframe
# By slicing - appended to the last column
df['City'] = city 
df

Unnamed: 0_level_0,Name,Height,Weight,Degree,City
Joined_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2020-04-01,Adnan Masuood,,160,PhD,Tampa
2020-04-02,Alla Abdella,5.4,150,Msc,
2020-04-03,Sam Tony,5.1,170,BSc,Orlando
2020-04-04,Nora Ahmed,5.5,158,BSc,
2020-04-05,John Cena,6.0,167,,Tampa
2020-04-06,Sam Tony,5.1,148,BSc,San Francisco


In [37]:
# Insert a column in a specific location
# Using DataFrame.insert() to add a column 
df.insert(2, "State", [np.nan, 'FL', 'PA', 'CA', 'OR', 'CA']) 
df

Unnamed: 0_level_0,Name,Height,State,Weight,Degree,City
Joined_Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-04-01,Adnan Masuood,,,160,PhD,Tampa
2020-04-02,Alla Abdella,5.4,FL,150,Msc,
2020-04-03,Sam Tony,5.1,PA,170,BSc,Orlando
2020-04-04,Nora Ahmed,5.5,CA,158,BSc,
2020-04-05,John Cena,6.0,OR,167,,Tampa
2020-04-06,Sam Tony,5.1,CA,148,BSc,San Francisco


In [38]:
# Add a record (row) to the end of the DataFrame.
# DataFrame.append() was deprecated in pandas 1.4 and removed in 2.0, so the
# new record is wrapped in a one-row DataFrame and joined with pd.concat().
# ignore_index=True discards the existing index and renumbers rows from 0.
new_row = pd.DataFrame([['Tom Sonny', np.nan, 'FL', 150, np.nan, 'Denver']], columns=df.columns)
df = pd.concat([df, new_row], ignore_index=True)
df

Unnamed: 0,Name,Height,State,Weight,Degree,City
0,Adnan Masuood,,,160,PhD,Tampa
1,Alla Abdella,5.4,FL,150,Msc,
2,Sam Tony,5.1,PA,170,BSc,Orlando
3,Nora Ahmed,5.5,CA,158,BSc,
4,John Cena,6.0,OR,167,,Tampa
5,Sam Tony,5.1,CA,148,BSc,San Francisco
6,Tom Sonny,,FL,150,,Denver


In [41]:
# Overwrite existing rows by position.
# Assigning through .iloc replaces the values of a row that already exists;
# it does not insert a new row, so the number of rows stays the same.
# Here the row at position 2 and the last row (position -1) are replaced.
df.iloc[2] = ['Alex Alexander', 6.2, np.nan, 159,'PhD', 'Los Angeles']
df.iloc[-1] = ['Alex Alexander', 6.2, np.nan, 159,'PhD', 'Los Angeles']
df

Unnamed: 0,Name,Height,State,Weight,Degree,City
0,Adnan Masuood,,,160,PhD,Tampa
1,Alla Abdella,5.4,FL,150,MSc,
2,Alex Alexander,6.2,,159,PhD,Los Angeles
3,Nora Ahmed,5.5,CA,158,BSc,
4,John Cena,6.0,OR,167,,Tampa
5,Sam Tony,5.1,CA,148,BSc,San Francisco
6,Alex Alexander,6.2,,159,PhD,Los Angeles


In [47]:
# Change a particular value in the DataFrame
df.iloc[1,4] = 'MSc'
df

Unnamed: 0,Name,Height,State,Weight,Degree,City
2,Alex Alexander,6.2,,159,PhD,Los Angeles
5,Sam Tony,5.1,CA,148,MSc,San Francisco
0,Adnan Masuood,,,160,PhD,Tampa
4,John Cena,6.0,OR,167,,Tampa
1,Alla Abdella,5.4,FL,150,MSc,
3,Nora Ahmed,5.5,CA,158,BSc,


In [48]:
# Check for duplicate rows
df.duplicated()

2    False
5    False
0    False
4    False
1    False
3    False
dtype: bool

In [49]:
# Drop duplicated rows, keeping the first occurrence of each,
# then renumber the index from 0 (the old index is discarded)
df = df.drop_duplicates(keep = 'first').reset_index(drop=True)
df

Unnamed: 0,Name,Height,State,Weight,Degree,City
0,Alex Alexander,6.2,,159,PhD,Los Angeles
1,Sam Tony,5.1,CA,148,MSc,San Francisco
2,Adnan Masuood,,,160,PhD,Tampa
3,John Cena,6.0,OR,167,,Tampa
4,Alla Abdella,5.4,FL,150,MSc,
5,Nora Ahmed,5.5,CA,158,BSc,


In [53]:
# Aggregation: mean of every numeric column for each Degree.
# The numeric columns are selected first because pandas >= 2.0 raises a
# TypeError when asked for the mean of text columns (Name, State, City)
# instead of silently dropping them as older versions did.
df1 = df.select_dtypes(include='number').groupby(df['Degree']).agg(['mean'])
df1

Unnamed: 0_level_0,Height,Weight
Unnamed: 0_level_1,mean,mean
Degree,Unnamed: 1_level_2,Unnamed: 2_level_2
BSc,5.5,158.0
MSc,5.25,149.0
PhD,6.2,159.5


In [54]:
# Sort data
df.sort_values(by='City', ascending=True, inplace=True)
df

Unnamed: 0,Name,Height,State,Weight,Degree,City
0,Alex Alexander,6.2,,159,PhD,Los Angeles
1,Sam Tony,5.1,CA,148,MSc,San Francisco
2,Adnan Masuood,,,160,PhD,Tampa
3,John Cena,6.0,OR,167,,Tampa
4,Alla Abdella,5.4,FL,150,MSc,
5,Nora Ahmed,5.5,CA,158,BSc,


In [56]:
# Check for null values.
# isnull() and isna() are aliases: True marks a missing value.
# notna() is the inverse: True marks a value that is present.
# Only the last expression of a cell is displayed, so the table shown for
# this cell is the result of notna().
df.isnull()
df.isna()
df.notna()

Unnamed: 0,Name,Height,State,Weight,Degree,City
0,True,True,False,True,True,True
1,True,True,True,True,True,True
2,True,False,False,True,True,True
3,True,True,True,True,False,True
4,True,True,True,True,True,False
5,True,True,True,True,True,False


In [57]:
# Select Only Categorical Subset from the Entire DataFrame
df_categorical = df.select_dtypes('object')
df_categorical.head()

Unnamed: 0,Name,State,Degree,City
0,Alex Alexander,,PhD,Los Angeles
1,Sam Tony,CA,MSc,San Francisco
2,Adnan Masuood,,PhD,Tampa
3,John Cena,OR,,Tampa
4,Alla Abdella,FL,MSc,


In [58]:
# Select only the numerical subset of the DataFrame.
# 'number' matches every numeric dtype (all integer and float widths), so the
# selection no longer depends on a hand-written list of dtype names.
df_numerical = df.select_dtypes(include='number')
df_numerical

Unnamed: 0,Height,Weight
0,6.2,159
1,5.1,148
2,,160
3,6.0,167
4,5.4,150
5,5.5,158


In [60]:
# Select Only Categorical Subset from the Entire DataFrame
df_categorical = df.select_dtypes('object')
df_categorical.tail(5)

Unnamed: 0,Name,State,Degree,City
1,Sam Tony,CA,MSc,San Francisco
2,Adnan Masuood,,PhD,Tampa
3,John Cena,OR,,Tampa
4,Alla Abdella,FL,MSc,
5,Nora Ahmed,CA,BSc,


In [90]:
# Add a new column of ordered, hourly timestamps (one per row).
# The step is given as a Timedelta instead of the 'H' alias, which recent
# pandas releases deprecate in favour of 'h'; a Timedelta is accepted by
# date_range regardless of which alias spelling the installed version uses.
df['Time-Joined'] = pd.date_range(start='04-02-2020', periods=df.shape[0], freq=pd.Timedelta(hours=1))
df

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00


In [91]:
# Concatenating multiple objects
df_concat = pd.concat([df, df], axis=1).reset_index(drop=True)
df_concat

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined,Name.1,Height.1,State.1,Weight.1,Degree.1,City.1,Time-Joined.1
0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00


In [92]:
# Concatenation by Row
df_concat2 = pd.concat([df, df], axis=0).reset_index(drop=True)
df_concat2

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00
6,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
7,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
8,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
9,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00


In [93]:
# Dropping rows 
df_concat2.drop(df_concat2.index[[10,11]])

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00
6,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
7,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
8,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
9,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00


# Create a Simple Frequency Encoder

In [84]:
# Try this in 5 minutes, write a function to return the frequency percentage of each category in all columns in the previous dataFrame.
def return_frequency(df):
    """Replace each text column with the percentage frequency of its values.

    Every object-dtype column whose name does not start with 'Name' is
    swapped for a '<column>_frequency %' column that holds, for each row,
    the share (0-100) of rows carrying the same value. Rows whose value is
    missing get NaN. Other columns are left untouched.

    The frame passed in is modified in place and is also returned.
    """
    # Snapshot the candidate columns up front: the loop adds and drops columns
    is_name_column = df.columns.str.contains('^Name')
    candidates = df.columns[~is_name_column]

    for name in candidates:
        if df[name].dtype != 'O':
            # Not a text column: keep it as it is
            continue
        # Fraction of all rows that fall in each category
        share = df.groupby(name).size() / len(df)
        df[name + '_frequency %'] = df[name].map(share) * 100
        # The encoded column replaces the original categorical one
        df.drop(columns=name, inplace=True)

    return df

In [94]:
# Dropping columns
df_concat2.drop(['City'], axis=1)

Unnamed: 0,Name,Height,State,Weight,Degree,Time-Joined
0,Alex Alexander,6.2,,159,PhD,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,2020-04-02 05:00:00
6,Alex Alexander,6.2,,159,PhD,2020-04-02 00:00:00
7,Sam Tony,5.1,CA,148,MSc,2020-04-02 01:00:00
8,Adnan Masuood,,,160,PhD,2020-04-02 02:00:00
9,John Cena,6.0,OR,167,,2020-04-02 03:00:00


In [85]:
df2 = df.copy()
df_encoded = return_frequency(df2)
df_encoded

Unnamed: 0,Name,Height,Weight,Time-Joined,State_frequency %,Degree_frequency %,City_frequency %
0,Alex Alexander,6.2,159,2020-04-02 00:00:00,,33.333333,16.666667
1,Sam Tony,5.1,148,2020-04-02 01:00:00,33.333333,33.333333,16.666667
2,Adnan Masuood,,160,2020-04-02 02:00:00,,33.333333,33.333333
3,John Cena,6.0,167,2020-04-02 03:00:00,16.666667,,33.333333
4,Alla Abdella,5.4,150,2020-04-02 04:00:00,16.666667,33.333333,
5,Nora Ahmed,5.5,158,2020-04-02 05:00:00,33.333333,16.666667,


# Fill Missing Values 

In [86]:
# Can we do better?
df3 = df_encoded.copy()
df3.fillna(value=0)

Unnamed: 0,Name,Height,Weight,Time-Joined,State_frequency %,Degree_frequency %,City_frequency %
0,Alex Alexander,6.2,159,2020-04-02 00:00:00,0.0,33.333333,16.666667
1,Sam Tony,5.1,148,2020-04-02 01:00:00,33.333333,33.333333,16.666667
2,Adnan Masuood,0.0,160,2020-04-02 02:00:00,0.0,33.333333,33.333333
3,John Cena,6.0,167,2020-04-02 03:00:00,16.666667,0.0,33.333333
4,Alla Abdella,5.4,150,2020-04-02 04:00:00,16.666667,33.333333,0.0
5,Nora Ahmed,5.5,158,2020-04-02 05:00:00,33.333333,16.666667,0.0


In [88]:
# Can we do even better?
# Impute instead of zero-filling: the numeric gap in Height gets the column
# mean, and gaps in the categorical columns get the most frequent value (mode).
df_mo_missing = df.copy()
categorical_cols = ['Name', 'State', 'City', 'Degree']
df_mo_missing = df_mo_missing.astype({col: 'category' for col in categorical_cols})
df_mo_missing['Height'] = df_mo_missing['Height'].fillna(df_mo_missing['Height'].mean())
# Each filled column is assigned back instead of calling
# fillna(..., inplace=True) on df_mo_missing[col]: that form operates on a
# temporary Series and does not update the DataFrame under Copy-on-Write.
for col in ['State', 'City', 'Degree']:
    # mode() can return several values on a tie; the first one is used
    most_frequent = df_mo_missing[col].mode().iloc[0]
    df_mo_missing[col] = df_mo_missing[col].fillna(most_frequent)
df_mo_missing

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,Alex Alexander,6.2,CA,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,5.64,CA,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,MSc,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,Tampa,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,Tampa,2020-04-02 05:00:00


In [95]:
# Save data
# The index is written to the file here (no index=False), presumably on
# purpose: when the file is read back it appears as an extra 'Unnamed: 0'
# column, which the following cells then show how to remove.
#df.to_csv('df.csv', index=False) 
df.to_csv('df.csv') 

In [96]:
# Read data
# Unnamed: 0: undesired column
df = pd.read_csv('df.csv')
df

Unnamed: 0.1,Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00


In [97]:
# Handle the undesired columns - Save the time
df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
df

Unnamed: 0,Name,Height,State,Weight,Degree,City,Time-Joined
0,Alex Alexander,6.2,,159,PhD,Los Angeles,2020-04-02 00:00:00
1,Sam Tony,5.1,CA,148,MSc,San Francisco,2020-04-02 01:00:00
2,Adnan Masuood,,,160,PhD,Tampa,2020-04-02 02:00:00
3,John Cena,6.0,OR,167,,Tampa,2020-04-02 03:00:00
4,Alla Abdella,5.4,FL,150,MSc,,2020-04-02 04:00:00
5,Nora Ahmed,5.5,CA,158,BSc,,2020-04-02 05:00:00


# End