In [39]:
import pandas as pd

# Read the CSV file into a DataFrame; the path is relative to the
# notebook's working directory.
data = pd.read_csv('data/lsd_math_score_data.csv')

In [40]:
# Print the whole frame as plain text to see what was loaded.
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score
0                      5     1.17                78.93
1                     15     2.97                58.20
2                     30     3.26                67.47
3                     60     4.69                37.47
4                    120     5.83                45.65
5                    240     6.00                32.92
6                    480     6.41                29.97


In [41]:
# Inspect the type of the object returned by read_csv.
type(data)

pandas.core.frame.DataFrame

# DataFrames

### ✅ Selecting a single column from a DataFrame:

In [42]:
# Indexing with a single column name (a string) returns that column
# as a Series.
only_math_scores = data['Avg_Math_Test_Score']
print(only_math_scores)

0    78.93
1    58.20
2    67.47
3    37.47
4    45.65
5    32.92
6    29.97
Name: Avg_Math_Test_Score, dtype: float64


### ✅ Adding a New Column to an existing DataFrame:

In [43]:
# Assigning a scalar to a column name that does not exist yet creates
# the column and fills every row with that value.
data['Test_Subject'] = 'Rita Lee'
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score Test_Subject
0                      5     1.17                78.93     Rita Lee
1                     15     2.97                58.20     Rita Lee
2                     30     3.26                67.47     Rita Lee
3                     60     4.69                37.47     Rita Lee
4                    120     5.83                45.65     Rita Lee
5                    240     6.00                32.92     Rita Lee
6                    480     6.41                29.97     Rita Lee


### ✅ How to manipulate the values of a column

For example, we can do a calculation on every value in a single column at once.

In [44]:
# Create a new column named High_Score; the scalar 100 is repeated
# in every row.
data['High_Score'] = 100

In [45]:
# Show the frame, which now includes the High_Score column.
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score Test_Subject  \
0                      5     1.17                78.93     Rita Lee   
1                     15     2.97                58.20     Rita Lee   
2                     30     3.26                67.47     Rita Lee   
3                     60     4.69                37.47     Rita Lee   
4                    120     5.83                45.65     Rita Lee   
5                    240     6.00                32.92     Rita Lee   
6                    480     6.41                29.97     Rita Lee   

   High_Score  
0         100  
1         100  
2         100  
3         100  
4         100  
5         100  
6         100  


In [46]:
# Element-wise addition: each row's High_Score gains that row's
# Avg_Math_Test_Score, and the sum replaces the old column.
data['High_Score'] = data['High_Score'].add(data['Avg_Math_Test_Score'])

In [47]:
# Show the frame after High_Score was updated.
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score Test_Subject  \
0                      5     1.17                78.93     Rita Lee   
1                     15     2.97                58.20     Rita Lee   
2                     30     3.26                67.47     Rita Lee   
3                     60     4.69                37.47     Rita Lee   
4                    120     5.83                45.65     Rita Lee   
5                    240     6.00                32.92     Rita Lee   
6                    480     6.41                29.97     Rita Lee   

   High_Score  
0      178.93  
1      158.20  
2      167.47  
3      137.47  
4      145.65  
5      132.92  
6      129.97  


In [48]:
# Square High_Score by multiplying the column with itself, row by row,
# then store the product back in the same column.
data['High_Score'] = data['High_Score'] * data['High_Score']
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score Test_Subject  \
0                      5     1.17                78.93     Rita Lee   
1                     15     2.97                58.20     Rita Lee   
2                     30     3.26                67.47     Rita Lee   
3                     60     4.69                37.47     Rita Lee   
4                    120     5.83                45.65     Rita Lee   
5                    240     6.00                32.92     Rita Lee   
6                    480     6.41                29.97     Rita Lee   

   High_Score  
0  32015.9449  
1  25027.2400  
2  28046.2009  
3  18898.0009  
4  21213.9225  
5  17667.7264  
6  16892.2009  


## Series

A DataFrame is a collection of pandas **Series** objects. Each column in the DataFrame is a Series, so we can create a new DataFrame by selecting only the Series (columns) of interest.

In [49]:
# Passing a list of column names selects just those columns as a DataFrame.
# The list can also be built first and passed by name:
#   column_list = ['LSD_ppm', 'Avg_Math_Test_Score']
#   clean_data = data[column_list]
clean_data = data[['LSD_ppm', 'Avg_Math_Test_Score']]
print(clean_data)

   LSD_ppm  Avg_Math_Test_Score
0     1.17                78.93
1     2.97                58.20
2     3.26                67.47
3     4.69                37.47
4     5.83                45.65
5     6.00                32.92
6     6.41                29.97


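This means that to create a single-column DataFrame we just need to pass a one-element list of column names inside the square brackets. (Passing the bare column name instead, as in `data['Avg_Math_Test_Score']`, returns a Series.)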

In [50]:
# A one-element list inside the brackets, so y is a single-column
# DataFrame rather than a Series.
y = data[['Avg_Math_Test_Score']]

In [51]:
# Same one-element-list selection, this time for the LSD_ppm column.
X = data[['LSD_ppm']]
print(X)
# As the last expression in the cell, the type is echoed as the cell's result.
type(X)

   LSD_ppm
0     1.17
1     2.97
2     3.26
3     4.69
4     5.83
5     6.00
6     6.41


pandas.core.frame.DataFrame

### ✅ How to delete a column from a DataFrame

In [53]:
# Remove the Test_Subject column from the frame in place.
# The membership check keeps this cell re-runnable: a bare `del` raises
# KeyError when the column has already been removed by an earlier run.
if 'Test_Subject' in data.columns:
    del data['Test_Subject']
print(data)

   Time_Delay_in_Minutes  LSD_ppm  Avg_Math_Test_Score  High_Score
0                      5     1.17                78.93  32015.9449
1                     15     2.97                58.20  25027.2400
2                     30     3.26                67.47  28046.2009
3                     60     4.69                37.47  18898.0009
4                    120     5.83                45.65  21213.9225
5                    240     6.00                32.92  17667.7264
6                    480     6.41                29.97  16892.2009
