In [1]:
import pandas as pd

In [3]:
!wget https://www.cis.fordham.edu/wisdm/includes/datasets/latest/WISDM_ar_latest.tar.gz

--2025-03-25 14:11:12--  https://www.cis.fordham.edu/wisdm/includes/datasets/latest/WISDM_ar_latest.tar.gz
Resolving www.cis.fordham.edu (www.cis.fordham.edu)... 150.108.68.29
Connecting to www.cis.fordham.edu (www.cis.fordham.edu)|150.108.68.29|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 11404612 (11M) [application/x-gzip]
Saving to: ‘WISDM_ar_latest.tar.gz’


2025-03-25 14:11:12 (32.0 MB/s) - ‘WISDM_ar_latest.tar.gz’ saved [11404612/11404612]



In [4]:
!tar -xzvf /content/WISDM_ar_latest.tar.gz

WISDM_ar_v1.1/
WISDM_ar_v1.1/readme.txt
WISDM_ar_v1.1/WISDM_ar_v1.1_raw.txt
WISDM_ar_v1.1/WISDM_ar_v1.1_raw_about.txt
WISDM_ar_v1.1/WISDM_ar_v1.1_transformed.arff
WISDM_ar_v1.1/WISDM_ar_v1.1_trans_about.txt


In [None]:
import pandas as pd

# Define column names
columns = ['user', 'activity', 'timestamp', 'x-axis', 'y-axis', 'z-axis']

# Load dataset with proper handling of spaces and delimiters
df_har = pd.read_csv(
    'WISDM_ar_v1.1/WISDM_ar_v1.1_raw.txt',
    header=None,
    names=columns,
    sep=',',  # Ensure correct delimiter
    engine='python',  # Helps with inconsistent spacing issues
    skip_blank_lines=True,  # Ignore empty lines
    on_bad_lines="skip"  # Skip rows with incorrect column count (for Pandas 1.3+)
)

# Remove null values
df_har = df_har.dropna()

# Display the first few rows
print(df_har.head())


   user activity       timestamp    x-axis     y-axis        z-axis
0    33  Jogging  49105962326000 -0.694638  12.680544   0.50395286;
1    33  Jogging  49106062271000  5.012288  11.264028   0.95342433;
2    33  Jogging  49106112167000  4.903325  10.882658  -0.08172209;
3    33  Jogging  49106222305000 -0.612916  18.496431    3.0237172;
4    33  Jogging  49106332290000 -1.184970  12.108489     7.205164;


In [None]:
df_har.shape

(1086465, 6)

# Pandas Basics
**Most used functions:**
```
 head():  Returns the first n rows.
 tail():  Returns the last n rows.
 info():  Provides a summary of the DataFrame.
 describe():  Generates descriptive statistics.
 sort_values():  Sorts the DataFrame by specified columns.
 groupby():  Groups the DataFrame using a mapper or by series of columns.
 merge():  Merges DataFrame or named series objects with a database-style join.
 apply():  Applies a function along the axis of the DataFrame.
 drop():  Removes specified labels from rows or columns.
 pivot_table():  Creates a pivot table.
 fillna(): Fills NA/NaN values.
 isnull():  Detects missing values.
```

**Pandas data type:**
```
 int64  :  Integer values.
 float64  :  Floating-point values.
 object  :  Text or mixed types.
 datetime64[ns]  :  Date and time values.
 bool  :  Boolean values.
```

## Creating Dataframe
DataFrame can be created from **list**, **dictionary**, **pandas series**. We can set columns by passing `columns` paramaters.

In [5]:
# import pandas as pd
import pandas as pd
import numpy as np

# Calling DataFrame constructor
empty_df = pd.DataFrame()
print(f"empty dataframe:\n{empty_df}", end="\n")

lst = ['Geeks', 'For', 'Geeks', 'is',
			'portal', 'for', 'Geeks']
# Calling DataFrame constructor on list
from_list_df = pd.DataFrame(lst)
print(f"\nfrom list to dataframe:\n{from_list_df}", end="\n")

# From a dictionary of lists
df1 = pd.DataFrame({'A': [1, 2, 3], 'B': [4, 5, 6]})
# From a list of dictionaries
df2 = pd.DataFrame([{'A': 1, 'B': 4}, {'A': 2, 'B': 5}, {'A': 3, 'B': 6}])
# From a 2D NumPy array
df3 = pd.DataFrame(np.array([[1, 4], [2, 5], [3, 6]]), columns=['A', 'B'])
# From a series
df4 = pd.DataFrame({'A': pd.Series([1, 2, 3]), 'B': pd.Series([4, 5, 6])})

print("")
print(df1)
# print(df2)
# print(df3)
# print(df4)


empty dataframe:
Empty DataFrame
Columns: []
Index: []

from list to dataframe:
        0
0   Geeks
1     For
2   Geeks
3      is
4  portal
5     for
6   Geeks

   A  B
0  1  4
1  2  5
2  3  6


## Indexing & Accessing DataFrame
By default indexes are 0, 1, 2, ..., n-1 that shows most left row of the table. But we can set any column as index of the DataFrame. we can access, manupulate any data using the index.<br><br>
**some important function for index**
```
range_index = df.index     # return RangeIndex(start=0, stop=n-1, step=1) when default index
df = df.set_index('Name')    # shows index as corresponding name
df = df.reset_index()    # reset index and assign default indexes
item = df.loc[ith_index]    # access item information from ith_index. loc and iloc works same but iloc only for integer.
df.loc[["row1", "row2"], ["column1", "column2", "column3"]]    # access 'row1' and 'row2' and columns 1 to 3.
df.rename(index={i_index: 'new_row_name1', j_index: 'new_row_name2'}, inplace=True)    # change index name old_index:new_index. here inplace parameter forcefully changes self index, but False values doesn't current change but return another dataframe.
df.rename_axis("Index", inplace=True)    # assign new index column name
df.index.name = 'INDEX_NAME'     # Rename the index in place, alternative above
age_column = df['COLUMN_NAME']     # Access the 'COLUMN_NAME' column
filtered_data = df[df['COLUMN_NAME'] > 25] # Access rows where 'COLUMN_NAME' value is greater than 25
display(df.head(5))    # shows the first 5 data.


```

In [None]:
import pandas as pd

data = {'Name': ['John', 'Alice', 'Bob', 'Eve', 'Charlie'],
        'Age': [25, 30, 22, 35, 28],
        'Gender': ['Male', 'Female', 'Male', 'Female', 'Male'],
        'Salary': [50000, 55000, 40000, 70000, 48000]}

df = pd.DataFrame(data)
# print(df) # print dataframe
# print(df.index)  # Accessing the index

# # Set 'Name' column as the index
# df_with_index = df.set_index('Name') # set index as Name column
# print(df_with_index)

# df1 = df_with_index.reset_index() # reset index and get back default index (0 - (n-1))
# print(df1)

# print(df.iat[1, 2]) # access integer poistion (row, col) position value
# print(df.at[1, 'Age']) # access second row and 'Age' column value

# print(df.loc[2]) # access 3rd index information
# print(df.iloc[2]) # access 3rd index information
# subset = df.loc[[0, 1]] # access 1st and second index information
# print(subset)
# subset = df.loc[0:2, ['Name', 'Age']] # Access the first three rows and the 'Name' and 'Age' columns
# print(subset)
# all = df.loc[:,["Name", "Salary"]] # select all rows and only 'Name' and 'Salary' column
# print(all)
# subset = df.iloc[[1, 2], [2, 3]] # select only 2nd, 3rd row and only 3rd and 4th column.
# print(subset)
# slicing = df.loc[index1:index3] # this index can be string or integer

# filtered_data = df[df['Age'] > 25] # Access rows where 'Age' is greater than 25
# print(filtered_data)

# salary_at_index_2 = df.at[2, 'Salary'] # Access the 'Salary' of the row with label 2
# print(salary_at_index_2)



# # df.rename(index={0: 'Row1', 1:'Row2'}, inplace=True) # change index name, here inplace parameter forcefully change current df
# # print(df)

# df.rename_axis("Index", inplace=True) # assign new index column name
# print(df)


# # Access the 'Age' column
# age_column = df['Age']
# print(age_column)


30


## Filtering Technique
DataFrame can be filtered using single or multiple condition. Dataframe is a very essential concept in Python and filtration of data is required can be performed based on various conditions. They can be achieved in any one of the above ways. Points to be noted:

* `loc` works with column labels and indexes.
* `eval` and `query` works only with columns.
* `Boolean` indexing works with values in a column only.

Some operator we should keep in mind:
* `&` is called **and** operator, must both value is true
* `|` is called **or** operator, any of them is true
* `~` is called **not** operator, inverse value

In [6]:
# import module
import pandas as pd

# assign data
df = pd.DataFrame({'Name': [' RACHEL  ', ' MONICA  ', ' PHOEBE  ',
                                   '  ROSS    ', 'CHANDLER', ' JOEY    '],
                          'Age': [30, 35, 37, 33, 34, 30],
                          'Salary': [100000, 93000, 88000, 120000, 94000, 95000],
                          'JOB': ['DESIGNER', 'CHEF', 'MASUS', 'PALENTOLOGY',
                                  'IT', 'ARTIST']})

# display(dataFrame) # display dataframe

# display(df.loc[(df['Salary']>=100000) & (df['Age']< 40) & (df['JOB'].str.startswith('D')),['Name','JOB']])  # filter dataframe using boolean index

# filtered_values = np.where((df['Salary']>=100000) & (df['Age']< 40) & (df['JOB'].str.startswith('D'))) # filter dataframe using numpy
# print(filtered_values)
# display(df.loc[filtered_values])

# display(df.query('Salary  <= 100000 & Age < 40 & JOB.str.startswith("C").values')) # df.query() functiion filtered with condition. Condition must be string type

# display(df[df.eval("Salary <=100000 & (Age <40) & JOB.str.startswith('A').values")]) # df.eval() function same wors as df.query().

# ages = [30, 25]
# filtered_df = df[df['Age'].isin(ages)] # shows the Age value is in the ages list
# print(filtered_df)



Unnamed: 0,Name,JOB
0,RACHEL,DESIGNER


## Sorting Pandas
Pandas provides a powerful method called `sort_values()` that allows to sort the DataFrame based on one or more columns. <br>

* Pandas allows you to specify the sorting algorithm using the `kind` parameter. The available parameter `quicksort`, `mergesort`, `heapsort`.
* can be added lamda expression to manupulate sorting

In [19]:
import pandas as pd
data = {'Name': ['Alice', 'Kawsar', 'Bob', 'Charlie', 'David'],
        'Age': [25, 30, None, 30, 40, ],
        'Score': [85, 90, 27, 95, 80]}
df = pd.DataFrame(data)

# Sorting by 'Age' in ascending order
# sorted_df = df.sort_values(by='Age')    # sorting by ascending order, here ascending default true and based on 'Age' column
# sorted_df = df.sort_values(by='Age', ascending = False)    # sorting by descending order, based on 'Age' column
# sorted_df = df.sort_values(["Score", "Age"])    # sorting by multiple columns
# sorted_df = df.sort_values(by='Age', na_position="last")    # handling missing value by giving na_position='last' or 'first'
# sorted_df = df.sort_values(by='Age', kind='mergesort') # keeping previous order when same value comes. kind parameter can be (heapsort, quicksort) as well.
# sorted_df = df.sort_values(by='Name', key=lambda col: col.str.lower()) # sort string by ignoring key sensitivity by providing key lamda expression

# print(sorted_df)


      Name   Age  Score
0    Alice  25.0     85
2      Bob   NaN     27
3  Charlie  30.0     95
4    David  40.0     80
1   Kawsar  30.0     90


## Merging, Joining, Concatening
see for details: https://www.geeksforgeeks.org/python-pandas-merging-joining-and-concatenating/

```
pd.concate()
  parameters:
    axis=0 (default) represent concatenate one after another
    axis=1 represent concatenate side by side.
    sort -> value will be sorting or not
    

```

In [32]:
# importing pandas module
import pandas as pd

# Define a dictionary containing employee data
data1 = {'Name': ['Jai', 'Princi', 'Gaurav', 'Anuj'],
         'Age': [27, 24, 22, 32],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
         'Qualification': ['Msc', 'MA', 'MCA', 'Phd']}

# Define a dictionary containing employee data
data2 = {'Name': ['Abhi', 'Ayushi', 'Dhiraj', 'Hitesh'],
         'Age': [17, 14, 12, 52],
         'Address': ['Nagpur', 'Kanpur', 'Allahabad', 'Kannuaj'],
         'Qualification': ['Btech', 'B.A', 'Bcom', 'B.hons'],
         'Gender': ['Male', 'Femal', 'Male', 'Femal']}

# Convert the dictionary into DataFrame
df = pd.DataFrame(data1, index=[0, 1, 2, 3])
df1 = pd.DataFrame(data2, index=[1, 5, 0, 7])

# using a .concat() method
# res = pd.concat([df, df1]) # concateing using list and concat() function
# res = pd.concat([df, df1], axis=1, join='inner') # inner join -> only take common value based on indexs
# res = pd.concat([df, df1], axis=1, sort=False) # inner join -> only take common value based on indexs


print(df, "\n\n", df1)
print("After concatening\n\n")
print(res)


     Name  Age    Address Qualification
0     Jai   27     Nagpur           Msc
1  Princi   24     Kanpur            MA
2  Gaurav   22  Allahabad           MCA
3    Anuj   32    Kannuaj           Phd 

      Name  Age    Address Qualification Gender
1    Abhi   17     Nagpur         Btech   Male
5  Ayushi   14     Kanpur           B.A  Femal
0  Dhiraj   12  Allahabad          Bcom   Male
7  Hitesh   52    Kannuaj        B.hons  Femal
After concatening


     Name   Age    Address Qualification    Name   Age    Address  \
0     Jai  27.0     Nagpur           Msc  Dhiraj  12.0  Allahabad   
1  Princi  24.0     Kanpur            MA    Abhi  17.0     Nagpur   
2  Gaurav  22.0  Allahabad           MCA     NaN   NaN        NaN   
3    Anuj  32.0    Kannuaj           Phd     NaN   NaN        NaN   
5     NaN   NaN        NaN           NaN  Ayushi  14.0     Kanpur   
7     NaN   NaN        NaN           NaN  Hitesh  52.0    Kannuaj   

  Qualification Gender  
0          Bcom   Male  
1       

## Pivot Table

## Files (csv, text, xlsx)
```
data = pd.read_csv("nba.csv", index_col ="Name")    # making data frame from csv file, making 'Name' column as index


```