In [1]:
import pandas as pd
import numpy as np

# ***Pandas Series***

In [2]:
pd.Series(range(1000))

0        0
1        1
2        2
3        3
4        4
      ... 
995    995
996    996
997    997
998    998
999    999
Length: 1000, dtype: int64

In [3]:
pd.Series(np.arange(0, 100, 2), dtype = np.int64).sum()

2450

In [4]:
numbers = pd.Series(range(0, 1000, 15))

In [5]:
numbers.dtype.name

'int64'

In [6]:
numbers.index

RangeIndex(start=0, stop=67, step=1)

In [7]:
names = pd.Series(["James", "Julia", "Bose", "Juan", "Belle", "Leslie", "Saurez", "Lenny", "Chadwick", "Natalie"])

In [8]:
names

0       James
1       Julia
2        Bose
3        Juan
4       Belle
5      Leslie
6      Saurez
7       Lenny
8    Chadwick
9     Natalie
dtype: object

In [9]:
names.index = range(100, 110, 1)

In [10]:
names

100       James
101       Julia
102        Bose
103        Juan
104       Belle
105      Leslie
106      Saurez
107       Lenny
108    Chadwick
109     Natalie
dtype: object

In [11]:
names[0]

KeyError: 0

In [12]:
names[100]

'James'

In [13]:
# NumPy has to infer one regular shape for the array; the nested tuple makes the shape
# inhomogeneous, hence the ValueError. Mixing scalar types alone is not the problem --
# mixed scalars simply produce a dtype=object array.
np.array(["Hello", 0.78, 78967, ("Hi there!", 78), None])

ValueError: setting an array element with a sequence. The requested array has an inhomogeneous shape after 1 dimensions. The detected shape was (5,) + inhomogeneous part.

In [14]:
# but the elements of pandas series do not!
pd.Series(["Hello", 0.78, 78967, ("Hi there!", 78), None], index = ["string", "float", "integer", "tuple", "NoneType"])

string                Hello
float                  0.78
integer               78967
tuple       (Hi there!, 78)
NoneType               None
dtype: object

In [15]:
# In an object-dtype Series, None stays None;
# in a numeric Series, None is converted to NaN and the integers are upcast to float64.

pd.Series(list(range(10)) + [None])

0     0.0
1     1.0
2     2.0
3     3.0
4     4.0
5     5.0
6     6.0
7     7.0
8     8.0
9     9.0
10    NaN
dtype: float64

In [16]:
# pandas represents the missing value as NaN, which is a float, so forcing an
# integer dtype on data that contains None fails with a TypeError.
pd.Series(list(range(10)) + [None], dtype = np.int64)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [17]:
pd.Series(list(range(10)) + [None], dtype = np.float64)

0     0.0
1     1.0
2     2.0
3     3.0
4     4.0
5     5.0
6     6.0
7     7.0
8     8.0
9     9.0
10    NaN
dtype: float64

In [18]:
# NaN is not equal to Python's native None
np.nan == None

False

In [19]:
np.array([0, 1, 2, 3, 4, None])

array([0, 1, 2, 3, 4, None], dtype=object)

In [20]:
np.array([0, 1, 2, 3, 4, None], dtype = float)

array([ 0.,  1.,  2.,  3.,  4., nan])

In [21]:
# In NumPy, too, None can only be stored as NaN, which requires a float array;
# asking for an int array raises a TypeError.
np.array([0, 1, 2, 3, 4, None], dtype = int)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'

In [22]:
Joe = {
    "Name": "Joel",
    "Age": 29,
    "Hobbies": ["Football", "Carrom", "Ice skating"],
    "Sex": "Male",
    "Job": "Backend developer"
}

In [23]:
# When creating a Series from a dict, the keys become the index labels and the values become the elements.
pd.Series(Joe)

Name                                  Joel
Age                                     29
Hobbies    [Football, Carrom, Ice skating]
Sex                                   Male
Job                      Backend developer
dtype: object

In [24]:
pd.Series(Joe.values(), index = Joe.keys())

Name                                  Joel
Age                                     29
Hobbies    [Football, Carrom, Ice skating]
Sex                                   Male
Job                      Backend developer
dtype: object

In [25]:
pd.Series(Joe) == pd.Series(Joe.values(), index = Joe.keys())

Name       True
Age        True
Hobbies    True
Sex        True
Job        True
dtype: bool

In [26]:
pd.Series(Joe).index

Index(['Name', 'Age', 'Hobbies', 'Sex', 'Job'], dtype='object')

In [27]:
pd.Series(Joe).values

array(['Joel', 29, list(['Football', 'Carrom', 'Ice skating']), 'Male',
       'Backend developer'], dtype=object)

In [28]:
Julia = {
    "fName": "Julia",
    "lName": "Roberts",
    "Age": 42,
    "Sex": "Female",
    "Married": False,
    "Education": "B.A Economics"
}

In [29]:
Julia.keys()

dict_keys(['fName', 'lName', 'Age', 'Sex', 'Married', 'Education'])

In [30]:
pd.Series(Julia, index = list(Julia.keys()) + ["Nationality"])

fName                  Julia
lName                Roberts
Age                       42
Sex                   Female
Married                False
Education      B.A Economics
Nationality              NaN
dtype: object

In [31]:
# "Education" is dropped because it is not listed in the index; "Nationality" has no
# matching key in the dict, so it is added with NaN as its value.
pd.Series(Julia, index = ['fName', 'lName', 'Age', 'Sex', 'Married', 'Nationality'])

fName            Julia
lName          Roberts
Age                 42
Sex             Female
Married          False
Nationality        NaN
dtype: object

In [13]:
# Read the Iris CSV and keep only its "sepal.length" column as a Series.
sepal_length = pd.read_csv("D:/DataSets/Iris-dataset.csv")["sepal.length"]

In [14]:
# Label every measurement with its variety. Only the "variety" column is parsed,
# instead of loading the whole CSV a second time just to read one column.
sepal_length.index = pd.read_csv("D:/DataSets/Iris-dataset.csv", usecols = ["variety"])["variety"]

In [16]:
sepal_length

variety
Setosa       5.1
Setosa       4.9
Setosa       4.7
Setosa       4.6
Setosa       5.0
            ... 
Virginica    6.7
Virginica    6.3
Virginica    6.5
Virginica    6.2
Virginica    5.9
Name: sepal.length, Length: 150, dtype: float64

In [18]:
# The index holds variety labels (strings), so request the first element by position
# explicitly. Plain [0] only works through an integer-as-position fallback, which
# newer pandas releases deprecate.
sepal_length.iloc[0]

5.1

In [19]:
sepal_length.iloc[18]

5.7

In [21]:
# First ten measurements carrying the "Setosa" label.
sepal_length.loc["Setosa"].head(10)

variety
Setosa    5.1
Setosa    4.9
Setosa    4.7
Setosa    4.6
Setosa    5.0
Setosa    5.4
Setosa    4.6
Setosa    5.0
Setosa    4.4
Setosa    4.9
Name: sepal.length, dtype: float64

In [22]:
sepal_length.shape

(150,)

In [23]:
sepal_length.index = range(150, 300,1)

In [25]:
# Plain [] with a slice selects by position, so this returns the first ten rows
# (labels 150-159), not rows labelled below 10; .iloc[:10] states that intent explicitly.
sepal_length[:10]

  sepal_length[:10]


150    5.1
151    4.9
152    4.7
153    4.6
154    5.0
155    5.4
156    4.6
157    5.0
158    4.4
159    4.9
Name: sepal.length, dtype: float64

In [26]:
sepal_length.iloc[:10]

150    5.1
151    4.9
152    4.7
153    4.6
154    5.0
155    5.4
156    4.6
157    5.0
158    4.4
159    4.9
Name: sepal.length, dtype: float64

In [30]:
sepal_length[299]

5.9

In [3]:
# NOTE(review): this run raised NameError. The execution counter restarted at In [3], so the
# kernel was presumably restarted -- re-run the cells that define sepal_length before this one.
sepal_length.loc[299]

NameError: name 'sepal_length' is not defined

In [4]:
# One million random integers in [0, 100), drawn from a seeded generator so that
# every run of the notebook works on the same data and reproduces the same results.
rng = np.random.default_rng(42)
numbers = rng.integers(0, 100, 1_000_000)

In [5]:
%%timeit

ave = 0
for i in numbers:
    ave += i
print(f"Average of the 1,000,000 random numbers is {ave/ len(numbers)}")

Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.476638
Average of the 1,000,000 random numbers is 49.

In [6]:
%%timeit

numbers.mean()

672 µs ± 10.9 µs per loop (mean ± std. dev. of 7 runs, 1,000 loops each)


In [7]:
# Vectorised arithmetic on the NumPy array: the scalar is added to every element.
# The total is stored as `total` so that the built-in sum() is not shadowed.
total = numbers.sum()
print(total)

new_sum = (numbers + 10.987).sum()
print(new_sum)

# The two values can differ in the last digits because of floating-point rounding.
print(f"Is {new_sum} equal to {total + (10.987 * len(numbers))}?")

49476638
60463638.00000007
Is 60463638.00000007 equal to 60463638.0?


In [8]:
# Two consecutive integer runs: x holds 0-9 and y holds 10-19.
x, y = pd.Series(range(0, 10)), pd.Series(range(10, 20))

In [10]:
# Stack y under x and renumber the result 0..n-1 in a single step,
# rather than mutating the concatenated Series in place afterwards.
joined = pd.concat([x, y], axis = 0, ignore_index = True)

In [11]:
joined

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
dtype: int64

In [12]:
# .loc selects by index label (the index values, not the enumerated offset!)
# .iloc selects by the enumerated offset from the starting point

langs = pd.Series({"C": 1, "Python": 2, "C#": 3, "assembly": 4, "Go": 5})

In [14]:
# The index labels are strings, so the element at enumerated offset 0 is requested
# explicitly with .iloc. Plain langs[0] only works through an integer-as-position
# fallback, which newer pandas releases deprecate.
langs.iloc[0]

1

In [15]:
langs["C"]

1

In [16]:
langs.loc["C"]

1

In [17]:
langs.loc["Python"]

2

In [19]:
langs.iloc["C#"]

TypeError: Cannot index by location index with a non-integer key

In [20]:
langs.loc[4]

KeyError: 4

### ***To avoid unnecessary confusion, avoid raw subscripting. Use the explicit .loc[] and .iloc[] indexers!***

# ***Pandas DataFrames***

In [24]:
# Load the class grades table. The path is absolute and machine-specific; adjust it when running elsewhere.
grades = pd.read_csv(r"D:/Introduction-to-Data-Science-in-Python/week-2/datasets/class_grades.csv")

In [25]:
grades.head()

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.5
1,8,95.05,105.49,67.5,99.07,68.33
2,8,83.7,83.17,,63.15,48.89
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [26]:
grades.loc[:, "Prefix"]

0     5
1     8
2     8
3     7
4     8
     ..
94    8
95    7
96    8
97    7
98    8
Name: Prefix, Length: 99, dtype: int64

In [27]:
grades.columns

Index(['Prefix', 'Assignment', 'Tutorial', 'Midterm', 'TakeHome', 'Final'], dtype='object')

In [29]:
grades.loc[3:7, "Final"]

3    80.56
4    73.89
5    68.06
6    50.00
7    56.11
Name: Final, dtype: float64

In [30]:
grades.iloc[3:5, :]

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
3,7,,,49.38,105.93,80.56
4,8,91.32,93.64,95.0,107.41,73.89


In [31]:
grades.loc[1:5, ["Prefix", "Final"]]

Unnamed: 0,Prefix,Final
1,8,68.33
2,8,48.89
3,7,80.56
4,8,73.89
5,7,68.06


In [41]:
grades.Tutorial.isna().sum()

3

In [45]:
# Rows whose Tutorial mark is missing, selected directly with the boolean mask
# instead of converting the mask to positions with np.where first.
grades.loc[grades.Tutorial.isna(), :]

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
3,7,,,49.38,105.93,80.56
19,8,93.04,,79.38,83.33,91.11
75,7,93.83,,106.25,94.44,102.78


In [47]:
grades.dropna(axis = 0)

Unnamed: 0,Prefix,Assignment,Tutorial,Midterm,TakeHome,Final
0,5,57.14,34.09,64.38,51.48,52.50
1,8,95.05,105.49,67.50,99.07,68.33
4,8,91.32,93.64,95.00,107.41,73.89
5,7,95.00,92.58,93.12,97.78,68.06
6,8,95.05,102.99,56.25,99.07,50.00
...,...,...,...,...,...,...
92,7,95.60,82.28,76.88,108.33,78.33
93,8,87.52,91.58,56.25,71.85,85.00
96,8,89.94,102.77,87.50,90.74,87.78
97,7,95.60,76.13,66.25,99.81,85.56


In [48]:
# Rebind to a new frame without the "Prefix" column instead of mutating in place.
# errors = "ignore" keeps the cell re-runnable once the column is already gone.
grades = grades.drop(columns = "Prefix", errors = "ignore")
grades

Unnamed: 0,Assignment,Tutorial,Midterm,TakeHome,Final
0,57.14,34.09,64.38,51.48,52.50
1,95.05,105.49,67.50,99.07,68.33
2,83.70,83.17,,63.15,48.89
3,,,49.38,105.93,80.56
4,91.32,93.64,95.00,107.41,73.89
...,...,...,...,...,...
94,,103.71,45.00,93.52,61.94
95,,80.54,41.25,93.70,39.72
96,89.94,102.77,87.50,90.74,87.78
97,95.60,76.13,66.25,99.81,85.56


In [62]:
# Number the rows 0..n-1. The length is taken from the frame itself rather than
# hard-coded, so the cell keeps working if the number of rows changes.
grades.loc[:, "PostFix"] = range(len(grades))

In [63]:
grades

Unnamed: 0,Assignment,Tutorial,Midterm,TakeHome,Final,PostFix
0,57.14,34.09,64.38,51.48,52.50,0
1,95.05,105.49,67.50,99.07,68.33,1
2,83.70,83.17,,63.15,48.89,2
3,,,49.38,105.93,80.56,3
4,91.32,93.64,95.00,107.41,73.89,4
...,...,...,...,...,...,...
94,,103.71,45.00,93.52,61.94,94
95,,80.54,41.25,93.70,39.72,95
96,89.94,102.77,87.50,90.74,87.78,96
97,95.60,76.13,66.25,99.81,85.56,97
