In [1]:
import pandas as pd
import numpy as np

In [2]:
# Build a Series from a plain list; pandas assigns a default 0..n-1 integer index.
# Mixing integers, NaN and a string gives the generic `object` dtype (see output).
pdSeries = pd.Series(data=[1, 2, np.nan, "A"])
pdSeries

0      1
1      2
2    NaN
3      A
dtype: object

In [20]:
# Two ways to build a DatetimeIndex between a start and an end date:
#   periods=N  -> N evenly spaced timestamps covering [start, end]
#   freq='3D'  -> step forward from start in fixed 3-day increments ('D' = day)
# The date strings are parsed month-first here (06/07/2023 -> 2023-06-07),
# as the displayed output confirms.
dateRangePeriod = pd.date_range("06/07/2023", "10/10/2023", periods=10)
dateRangeFreq = pd.date_range("06/07/2023", "07/10/2023", freq='3D')
dateRangeFreq

DatetimeIndex(['2023-06-07', '2023-06-10', '2023-06-13', '2023-06-16',
               '2023-06-19', '2023-06-22', '2023-06-25', '2023-06-28',
               '2023-07-01', '2023-07-04', '2023-07-07', '2023-07-10'],
              dtype='datetime64[ns]', freq='3D')

## Dataframe Creation

### Creating DF from Numpy

In [54]:
# Row-oriented input: one inner list per record.
# Note: despite its name, numpyArray1 is a plain nested Python list.
numpyArray1 = [
    ['Islam', 'PROG8245', 40],
    ['Islam', 'PROG8060', 30],
    ['Ran', 'CSCN8010', 39],
]
# Column labels and a custom integer index (11, 12, 13) are supplied explicitly.
dfFromNumpy = pd.DataFrame(
    data=numpyArray1,
    index=[11, 12, 13],
    columns=['Name', 'Course', 'NumberOfStudents'],
)
dfFromNumpy

Unnamed: 0,Name,Course,NumberOfStudents
11,Islam,PROG8245,40
12,Islam,PROG8060,30
13,Ran,CSCN8010,39


### Creating DF from Dictionary

In [33]:
# Column-oriented input: each key becomes a column label and its list
# supplies that column's values, top to bottom.
dictionaryDF = {
    'Name': ['Islam', 'Mahmoud', 'Ran'],
    'Course': ['PROG8245', 'PROG8060', 'CSCN8010'],
    'NumberOfStudents': [40, 30, 39],
}
# The same custom index labels (11, 12, 13) as the list-based frame.
dfFromDictionary = pd.DataFrame(data=dictionaryDF, index=[11, 12, 13])
dfFromDictionary

Unnamed: 0,Name,Course,NumberOfStudents
11,Islam,PROG8245,40
12,Mahmoud,PROG8060,30
13,Ran,CSCN8010,39


In [36]:
# head(n) returns the first n rows of the frame (here n = 2)
dfFromDictionary.head(2)

Unnamed: 0,Name,Course,NumberOfStudents
11,Islam,PROG8245,40
12,Mahmoud,PROG8060,30


In [40]:
# tail(n) returns the last n rows of the frame (here n = 2)
dfFromNumpy.tail(2)

Unnamed: 0,Name,Course,NumberOfStudents
12,Mahmoud,PROG8060,30


In [41]:
# Slice rows by position, list-style: start inclusive, stop exclusive,
# so 1:2 yields only the second row (index label 12).
dfFromNumpy.iloc[1:2]

Unnamed: 0,Name,Course,NumberOfStudents
12,Mahmoud,PROG8060,30


### Dataframe functions

In [47]:
# Row (index) labels of the frame, printed as a NumPy array
print(dfFromNumpy.index.to_numpy())
# Column labels of the frame, printed as a NumPy array
print(dfFromNumpy.columns.to_numpy())

[11 12 13]
['Name' 'Course' 'NumberOfStudents']


In [48]:
# Convert the frame's values to a NumPy array; column names and index labels
# are not part of the result. With mixed string/integer columns the array
# has dtype=object, as the output shows.
dfFromNumpy.to_numpy()

array([['Islam', 'PROG8245', 40],
       ['Mahmoud', 'PROG8060', 30],
       ['Ran', 'CSCN8010', 39]], dtype=object)

In [51]:
# Convert the frame to a nested dictionary of the form
# {column_name: {index_label: value}}, so both column names and index
# labels are preserved.
dfFromNumpy.to_dict()

{'Name': {11: 'Islam', 12: 'Mahmoud', 13: 'Ran'},
 'Course': {11: 'PROG8245', 12: 'PROG8060', 13: 'CSCN8010'},
 'NumberOfStudents': {11: 40, 12: 30, 13: 39}}

In [57]:
# Summary statistics for every column. include='all' reports the
# non-numeric columns (count / unique / top / freq) alongside the numeric
# ones (mean / std / min / quartiles); statistics that do not apply to a
# column are left empty (NaN).
dfFromNumpy.describe(include='all')

Unnamed: 0,Name,Course,NumberOfStudents
count,3,3,3.0
unique,2,3,
top,Islam,PROG8245,
freq,2,1,
mean,,,36.333333
std,,,5.507571
min,,,30.0
25%,,,34.5
50%,,,39.0
75%,,,39.5


In [88]:
# Column-wise maximum (axis=0 is the default). String columns are compared
# lexicographically, which is why Name and Course also report a "maximum".
print("---- Dataframe Maximum ----")
print(dfFromNumpy.max(axis=0))

# Column-wise minimum
print("---- Dataframe Minimum ----")
print(dfFromNumpy.min(axis=0))

# argmax returns the 0-based *position* of the largest value, not its index
# label (argmin is the counterpart for the smallest value).
maxPosition = dfFromNumpy["NumberOfStudents"].argmax()
print(f'Index of maximum {maxPosition}')

---- Dataframe Maximum ----
Name                     Ran
Course              PROG8245
NumberOfStudents          40
dtype: object
---- Dataframe Minimum ----
Name                   Islam
Course              CSCN8010
NumberOfStudents          30
dtype: object
Index of maximum 1


In [100]:
# iloc selects by integer position: first two rows, last two columns.
# A notebook cell only auto-displays its final expression, so this
# intermediate selection is printed explicitly; otherwise its result
# would be computed and silently discarded.
print(dfFromNumpy.iloc[:2, -2:])
# loc selects by index label: the rows labelled 11 and 12 (all columns).
dfFromNumpy.loc[[11, 12]]

Unnamed: 0,Name,Course,NumberOfStudents
11,Islam,PROG8245,40
12,Islam,PROG8060,30


In [62]:
# Select the 'Name' column; the result is a Series that keeps the frame's index.
# Attribute-style access is an alternative for column names that are valid
# Python identifiers:
# dfFromNumpy.Name
dfFromNumpy['Name']

11    Islam
12    Islam
13      Ran
Name: Name, dtype: object

In [63]:
# Transpose the frame: index labels (11, 12, 13) become the columns and the
# column names become the index.
dfFromNumpy.T

Unnamed: 0,11,12,13
Name,Islam,Islam,Ran
Course,PROG8245,PROG8060,CSCN8010
NumberOfStudents,40,30,39


In [76]:
# Boolean mask: True for every row whose Name equals 'Islam'
filterName = (dfFromNumpy['Name'] == 'Islam')
# Keep only the rows where the mask is True
dfFromNumpy.loc[filterName]

Unnamed: 0,Name,Course,NumberOfStudents
11,Islam,PROG8245,40
12,Islam,PROG8060,30


### Combining 2 Dataframes

In [128]:
# Stack the two frames vertically (row-wise). The original index labels are
# kept, so 11, 12 and 13 each occur twice in the combined frame.
dfConcat = pd.concat(objs=[dfFromNumpy, dfFromDictionary], axis=0)

In [131]:
# Group the combined frame by (Name, Course) and print each group's rows.
# Iterating over the GroupBy object yields (group_key, sub_frame) pairs in
# sorted key order. This replaces groupby(...).apply(print), which used
# apply purely for its side effect, produced a meaningless return value as
# cell output, and relied on the grouping columns being passed to the
# applied function (behaviour that newer pandas releases deprecate).
for groupKey, groupRows in dfConcat.groupby(['Name', 'Course']):
    print(groupRows)

     Name    Course  NumberOfStudents
12  Islam  PROG8060                30
     Name    Course  NumberOfStudents
11  Islam  PROG8245                40
11  Islam  PROG8245                40
       Name    Course  NumberOfStudents
12  Mahmoud  PROG8060                30
   Name    Course  NumberOfStudents
13  Ran  CSCN8010                39
13  Ran  CSCN8010                39


### Loading CSV Example

In [5]:
# Load a semicolon-separated file. Column labels are supplied via `names`,
# so the first line of the file is read as data (row 0 in the output).
# The path is relative to the notebook's working directory.
dataframeFromCSV = pd.read_csv(
    'pandas_tutorial_read.csv',
    sep=';',
    names=['my_datetime', 'event', 'country', 'user_id', 'source', 'topic'],
)
# Preview the first ten rows instead of dumping the whole frame
dataframeFromCSV.head(10)

Unnamed: 0,my_datetime,event,country,user_id,source,topic
0,2018-01-01 00:01:01,read,country_7,2458151261,SEO,North America
1,2018-01-01 00:03:20,read,country_7,2458151262,SEO,South America
2,2018-01-01 00:04:01,read,country_7,2458151263,AdWords,Africa
3,2018-01-01 00:04:02,read,country_7,2458151264,AdWords,Europe
4,2018-01-01 00:05:03,read,country_8,2458151265,Reddit,North America
5,2018-01-01 00:05:42,read,country_6,2458151266,Reddit,North America
6,2018-01-01 00:06:06,read,country_2,2458151267,Reddit,Europe
7,2018-01-01 00:06:15,read,country_6,2458151268,AdWords,Europe
8,2018-01-01 00:07:21,read,country_7,2458151269,AdWords,North America
9,2018-01-01 00:07:29,read,country_5,2458151270,Reddit,North America


In [11]:
# Row masks: readers from country_2, and articles on the topic 'Asia'
filter1 = (dataframeFromCSV['country'] == 'country_2')
filter2 = (dataframeFromCSV['topic'] == 'Asia')
# Columns to keep in the result
columnNames = ['my_datetime', 'user_id', 'event']
# Combine the masks element-wise with & and select rows and columns in a
# single .loc call. If the comparisons are written inline rather than via
# named masks, each one must be parenthesised because & binds tighter than ==.
dataframeFromCSV.loc[filter1 & filter2, columnNames]

Unnamed: 0,my_datetime,user_id,event
17,2018-01-01 00:13:06,2458151278,read
19,2018-01-01 00:14:53,2458151280,read
20,2018-01-01 00:15:44,2458151281,read
23,2018-01-01 00:17:31,2458151284,read
28,2018-01-01 00:23:16,2458151289,read
...,...,...,...
1775,2018-01-01 23:47:19,2458153036,read
1778,2018-01-01 23:51:17,2458153039,read
1781,2018-01-01 23:51:52,2458153042,read
1782,2018-01-01 23:53:03,2458153043,read
