# Chapter 2 - Pandas Structures

# 2.2 Creating your own data

# 2.2.1 Creating a series

In [1]:
import pandas as pd

In [2]:
# Build a Series from a plain list. No `index` argument is given, so
# pandas labels the values 0, 1, ... automatically.
s = pd.Series(data=["banana", 42])
s

0    banana
1        42
dtype: object

In [3]:
# Supply explicit row labels through the `index` argument instead of
# relying on the default 0..n-1 labels.
s = pd.Series(
    data=["Michael Jackson", "Singer"],
    index=["person", "who"],
)
s

person    Michael Jackson
who                Singer
dtype: object

# 2.2.2 Creating a dataframe 

In [4]:
# Build a dataframe from a dictionary: every key becomes a column and every
# list holds that column's values. Dictionaries keep insertion order on
# Python 3.7+, so the columns appear in the order written here.
# No `index` is passed, so the rows get the default labels 0 and 1.
scientists = pd.DataFrame(
    data=dict(
        Name=["Rosaline Franklin", "William Gosset"],
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    )
)

scientists


Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


In [5]:
# Same data, but the scientists' names are used as the row labels (`index`)
# instead of being stored in a "Name" column. `columns` spells out the
# column order explicitly.
scientists = pd.DataFrame(
    dict(
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    ),
    columns=["Occupation", "Born", "Died", "Age"],
    index=["Rosaline Franklin", "William Gosset"],
)

scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [6]:
# Creating a dataframe from an OrderedDict: the column order is the order in
# which the keys are inserted, on every Python version.
from collections import OrderedDict

scientist_columns = OrderedDict()
scientist_columns["Name"] = ["Rosaline Franklin", "William Gosset"]
scientist_columns["Occupation"] = ["Chemist", "Statistician"]
scientist_columns["Born"] = ["1920-07-25", "1876-06-13"]
scientist_columns["Died"] = ["1958-04-16", "1937-10-16"]
scientist_columns["Age"] = [37, 61]

scientists = pd.DataFrame(scientist_columns)

scientists


Unnamed: 0,Name,Occupation,Born,Died,Age
0,Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
1,William Gosset,Statistician,1876-06-13,1937-10-16,61


# 2.3 The series

In [7]:
# First, create a dataframe whose rows are labelled with the scientists'
# names, so that a single row can later be pulled out by label.
scientists = pd.DataFrame(
    index=["Rosaline Franklin", "William Gosset"],
    columns=["Occupation", "Born", "Died", "Age"],
    data=dict(
        Occupation=["Chemist", "Statistician"],
        Born=["1920-07-25", "1876-06-13"],
        Died=["1958-04-16", "1937-10-16"],
        Age=[37, 61],
    ),
)

scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [8]:
#Let's print the information as Series

scientists.loc["William Gosset"]


Occupation    Statistician
Born            1876-06-13
Died            1937-10-16
Age                     61
Name: William Gosset, dtype: object

In [9]:
#Print attributes and methods associated with the Series

# row index
scientists.loc["William Gosset"].index


Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

In [10]:
#Print attributes and methods associated with the Series

#row values
scientists.loc["William Gosset"].values


array(['Statistician', '1876-06-13', '1937-10-16', 61], dtype=object)

In [11]:
#Print attributes and methods associated with the Series

# row keys

scientists.loc["William Gosset"].keys()

Index(['Occupation', 'Born', 'Died', 'Age'], dtype='object')

In [12]:
# Series attributes vs. methods: `index` is an attribute and is read without
# parentheses, whereas `keys` is a method and has to be called with ().

print("these are differences between attributes such as index and methods such as keys. See how the () is used:")

# Both expressions take the first label of the "William Gosset" row, so the
# displayed tuple contains the same label twice.
scientists.loc["William Gosset"].index[0], scientists.loc["William Gosset"].keys()[0]


these are differences between attributes such as index and methods such as keys. See how the () is used:


('Occupation', 'Occupation')

# 2.3.1  The series is ndarray-like

A pandas Series is similar to a numpy.ndarray. Many methods and functions that operate on an ndarray will also operate on a pandas Series. A Series is sometimes referred to as a "vector".

# 2.3.1.1 Series methods

In [13]:
#Let's take a look at the dataframe

scientists

Unnamed: 0,Occupation,Born,Died,Age
Rosaline Franklin,Chemist,1920-07-25,1958-04-16,37
William Gosset,Statistician,1876-06-13,1937-10-16,61


In [14]:
# Series methods are listed on page 31. (Table 2.2)

# Print the mean, minimum, maximum and standard deviation of the ages,
# one value per line.
print(
    scientists["Age"].mean(),
    scientists["Age"].min(),
    scientists["Age"].max(),
    scientists["Age"].std(),
    sep="\n",
)


49.0
37
61
16.97056274847714


# 2.3.2 Boolean subsetting: Series

In [15]:
# Let's import a larger dataset (used for the rest of the chapter).
#
# The data directory can be overridden with the PANDAS_DATA_DIR environment
# variable so the notebook also runs on other machines. When the variable is
# not set, the original local path is used.
import os
from pathlib import Path

DATA_DIR = Path(
    os.environ.get(
        "PANDAS_DATA_DIR",
        "C:/Users/adri_/Documents/GitHub- Adriana/Pandas for everyone/data",
    )
)

scientists = pd.read_csv(DATA_DIR / "scientists.csv")

In [16]:
# Let's extract the age column
scientists["Age"]

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [17]:
# Get basic stats by default

# Use describe function

scientists["Age"].describe()


count     8.000000
mean     59.125000
std      18.325918
min      37.000000
25%      44.000000
50%      58.500000
75%      68.750000
max      90.000000
Name: Age, dtype: float64

In [18]:
# Get mean of all ages

scientists["Age"].mean()

59.125

In [19]:
# Subset the ages that are above the mean: the comparison produces a boolean
# mask and `.loc` keeps the "Age" values of the rows where the mask is True.
scientists.loc[scientists["Age"] > scientists["Age"].mean(), "Age"]

1    61
2    90
3    66
7    77
Name: Age, dtype: int64

In [20]:
# Display the boolean vector for the condition "age is above the mean":
# one True/False value per row.
scientists["Age"].gt(scientists["Age"].mean())

0    False
1     True
2     True
3     True
4    False
5    False
6    False
7     True
Name: Age, dtype: bool

# 2.3.3 Operations are automatically aligned and vectorized (Broadcasting)

# 2.3.3.1 Vectors of the same length

In [21]:
# Let's look at the "Age" column of the dataframe.

# A single column is a Series, i.e. a vector.

scientists["Age"]

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [22]:
# Addition of vectors

scientists["Age"] + scientists["Age"]

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

In [23]:
# Multiplication of vectors

scientists["Age"] * scientists["Age"]

0    1369
1    3721
2    8100
3    4356
4    3136
5    2025
6    1681
7    5929
Name: Age, dtype: int64

# 2.3.3.2 Vectors with integers (scalar)

In [24]:
# Addition of vector and scalar

scientists["Age"] + 100

0    137
1    161
2    190
3    166
4    156
5    145
6    141
7    177
Name: Age, dtype: int64

In [25]:
# Multiplication of vector and scalar

scientists["Age"] * 2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

# 2.3.3.3 Vectors with different lengths

In pandas, if vectors are not of the same length and we are working with Series, a "broadcast" will occur.
This means the operation is performed on the values matched by index label, and where a label is missing from one of the vectors the result is "NaN".

With other types (not a Series), the shapes must match.

In [26]:
#Let's look at the first vector

vector_1 = scientists["Age"]
vector_1


0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [27]:
# Let's look at the second vector

vector_2 = [1,100]
vector_2

[1, 100]

In [28]:
# Addition of vector_1 and vector_2 (the list is wrapped in a Series first).
# The values are matched by index label: only labels 0 and 1 exist in both
# operands, so every other row of the result is NaN.

vector_1 + pd.Series(vector_2)

0     38.0
1    161.0
2      NaN
3      NaN
4      NaN
5      NaN
6      NaN
7      NaN
dtype: float64

# 2.3.3.4 Vectors with common index labels (automatic alignment)

In [29]:
#Let's look at the first vector

vector_1 = scientists["Age"]
vector_1

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [30]:
# Let's look at the second vector

vector_2 = scientists["Age"].sort_index(ascending = False)
vector_2

7    77
6    41
5    45
4    56
3    66
2    90
1    61
0    37
Name: Age, dtype: int64

In [31]:
# Addition of vector_1 and vector_2 using the index labels:

vector_1 + vector_2

0     74
1    122
2    180
3    132
4    112
5     90
6     82
7    154
Name: Age, dtype: int64

# 2.4 The Dataframe

# 2.4.1 Boolean Subsetting: Dataframes

In [32]:
# Let's look at the dataframe
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [33]:
# Let's look at a subset based on one (1) variable: age above the mean.

# A boolean vector subsets rows, so every column of the matching
# scientists is kept.
scientists.loc[scientists["Age"] > scientists["Age"].mean()]

Unnamed: 0,Name,Born,Died,Age,Occupation
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


# 2.4.2 Operations are automatically aligned and vectorized (Broadcasting)

In [34]:
# Let's look at the dataframe
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist
1,William Gosset,1876-06-13,1937-10-16,61,Statistician
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist
5,John Snow,1813-03-15,1858-06-16,45,Physician
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician


In [35]:
# Multiplication of every column (vector) of the dataframe by a scalar.

# This repeats the strings twice and multiplies the numbers by 2.

scientists * 2

Unnamed: 0,Name,Born,Died,Age,Occupation
0,Rosaline FranklinRosaline Franklin,1920-07-251920-07-25,1958-04-161958-04-16,74,ChemistChemist
1,William GossetWilliam Gosset,1876-06-131876-06-13,1937-10-161937-10-16,122,StatisticianStatistician
2,Florence NightingaleFlorence Nightingale,1820-05-121820-05-12,1910-08-131910-08-13,180,NurseNurse
3,Marie CurieMarie Curie,1867-11-071867-11-07,1934-07-041934-07-04,132,ChemistChemist
4,Rachel CarsonRachel Carson,1907-05-271907-05-27,1964-04-141964-04-14,112,BiologistBiologist
5,John SnowJohn Snow,1813-03-151813-03-15,1858-06-161858-06-16,90,PhysicianPhysician
6,Alan TuringAlan Turing,1912-06-231912-06-23,1954-06-071954-06-07,82,Computer ScientistComputer Scientist
7,Johann GaussJohann Gauss,1777-04-301777-04-30,1855-02-231855-02-23,154,MathematicianMathematician


# 2.5 Making changes to Series and Dataframes

# 2.5.1 Add Additional Columns

In [36]:
# Format the "born" column as datetime
pd.to_datetime(scientists["Born"], format = "%Y-%m-%d")

0   1920-07-25
1   1876-06-13
2   1820-05-12
3   1867-11-07
4   1907-05-27
5   1813-03-15
6   1912-06-23
7   1777-04-30
Name: Born, dtype: datetime64[ns]

In [37]:
# Format the "Died" column as datetime
pd.to_datetime(scientists["Died"], format = "%Y-%m-%d")

0   1958-04-16
1   1937-10-16
2   1910-08-13
3   1934-07-04
4   1964-04-14
5   1858-06-16
6   1954-06-07
7   1855-02-23
Name: Died, dtype: datetime64[ns]

In [38]:
# Create a new column using the datetime columns of "born"

scientists["born_new_column"] = pd.to_datetime(scientists["Born"], format = "%Y-%m-%d")
scientists



Unnamed: 0,Name,Born,Died,Age,Occupation,born_new_column
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30


In [39]:
# Create a new column using the datetime columns of "Died"

scientists["died_new_column"] = pd.to_datetime(scientists["Died"], format = "%Y-%m-%d")
scientists


Unnamed: 0,Name,Born,Died,Age,Occupation,born_new_column,died_new_column
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23


In [40]:
# Create a new column from the two datetime columns.

# Days lived = date of death minus date of birth.
scientists["days_lived"] = scientists["died_new_column"].sub(scientists["born_new_column"])
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_new_column,died_new_column,days_lived
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,28422 days


# 2.5.2 Directly change a column using random values

In [41]:
# Let's take a look at the dataframe
scientists

Unnamed: 0,Name,Born,Died,Age,Occupation,born_new_column,died_new_column,days_lived
0,Rosaline Franklin,1920-07-25,1958-04-16,37,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,61,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,90,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,66,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,56,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,45,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,41,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,77,Mathematician,1777-04-30,1855-02-23,28422 days


In [42]:
# Let's select the column "Age"

scientists["Age"]

0    37
1    61
2    90
3    66
4    56
5    45
6    41
7    77
Name: Age, dtype: int64

In [43]:
# Let's shuffle the values using the random function

import random


In [59]:
# Set a seed so the randomness is always the same
random.seed(42)

# Shuffle a plain-list copy of the ages and assign it back to the column.
# Calling random.shuffle() directly on scientists["Age"] mutates a Series
# obtained through chained indexing: older pandas versions may emit a
# SettingWithCopyWarning for it, and with copy-on-write enabled the dataframe
# is not modified at all. Shuffling the list performs the same swaps, so the
# resulting order for a given seed is unchanged.
shuffled_ages = scientists["Age"].tolist()
random.shuffle(shuffled_ages)
scientists["Age"] = shuffled_ages
scientists["Age"]


0    61
1    41
2    66
3    56
4    37
5    45
6    77
7    90
Name: Age, dtype: int64

# 2.5.3 Dropping values

In [45]:
# Show all columns in data

scientists.columns

Index(['Name', 'Born', 'Died', 'Age', 'Occupation', 'born_new_column',
       'died_new_column', 'days_lived'],
      dtype='object')

In [46]:
# Drop one (1) column: "Age".

# The result is displayed but not assigned, so `scientists` itself keeps
# the column.
scientists.drop(columns=["Age"])


Unnamed: 0,Name,Born,Died,Occupation,born_new_column,died_new_column,days_lived
0,Rosaline Franklin,1920-07-25,1958-04-16,Chemist,1920-07-25,1958-04-16,13779 days
1,William Gosset,1876-06-13,1937-10-16,Statistician,1876-06-13,1937-10-16,22404 days
2,Florence Nightingale,1820-05-12,1910-08-13,Nurse,1820-05-12,1910-08-13,32964 days
3,Marie Curie,1867-11-07,1934-07-04,Chemist,1867-11-07,1934-07-04,24345 days
4,Rachel Carson,1907-05-27,1964-04-14,Biologist,1907-05-27,1964-04-14,20777 days
5,John Snow,1813-03-15,1858-06-16,Physician,1813-03-15,1858-06-16,16529 days
6,Alan Turing,1912-06-23,1954-06-07,Computer Scientist,1912-06-23,1954-06-07,15324 days
7,Johann Gauss,1777-04-30,1855-02-23,Mathematician,1777-04-30,1855-02-23,28422 days


# 2.6 Exporting and Importing Data in Pandas (Pickle data)

# 2.6.1.1 Series

In [47]:
# Let's select a column by name

scientists["Name"]

0       Rosaline Franklin
1          William Gosset
2    Florence Nightingale
3             Marie Curie
4           Rachel Carson
5               John Snow
6             Alan Turing
7            Johann Gauss
Name: Name, dtype: object

In [48]:
# Pass in a string to the path you want to save:

# scientists["Name"].to_pickle(".....directory..... /output/scientists_name_series.pickle")


# 2.6.1.2 Dataframe

In [49]:
# Pass in a string to the path you want to save:

# scientists.to_pickle(".....directory..... /output/scientists_name_df.pickle")


# 2.6.1.3 Reading pickle data

In [50]:
# For Series, use the pd.read_pickle function

# pd.read_pickle(".....directory..... /output/scientists_name_series.pickle")

In [51]:
# For dataframes, use the pd.read_pickle function

# pd.read_pickle(".....directory..... /output/scientists_name_df.pickle")

# 2.6.2 CSV files

In [52]:
# Save a Series into a csv file

# scientists["Name"].to_csv(".....directory..... /output/scientists_name_series.csv")

In [53]:
# Save a dataframe into a tsv file (tab separated values)

# scientists.to_csv(".....directory..... /output/scientists_df.tsv", sep="\t")

In [54]:
# Importing csv data

# use the pd.read_csv function

# 2.6.3 Excel

# 2.6.3.1 Series

In [55]:
# Save a Series into an Excel file

# Convert the Series into a one-column dataframe first:
names_df = scientists["Name"].to_frame() 

# Writing .xls files relies on the xlwt package, which must be installed.
# NOTE(review): recent pandas releases appear to have dropped .xls writing via
# xlwt - confirm against the installed pandas version before relying on this.
import xlwt
# names_df.to_excel(".....directory..... /output/scientists_name_series_df.xls")


# Writing .xlsx files relies on the openpyxl package, which must be installed.
import openpyxl
# names_df.to_excel(".....directory..... /output/scientists_name_series_df.xlsx")


    

# 2.6.3.2. Dataframe

In [56]:
# Saving a dataframe into an Excel format

# scientists.to_excel(".....directory..... /output/scientists_df.xlsx", sheet_name = "scientists", index = False)

# 2.6.5 Other data output types

In [57]:
# See page 48 (chapter 2).