In [2]:
import numpy as np
import pandas as pd
import geopandas as gpd
import os
import shapely

# pandas

In [2]:
data = {'Name': ['A', 'B', 'C'],
        'Age': [25, 30, 25],
        'Country': ['SN', 'MY', 'TH']
        }
df = pd.DataFrame(data)

In [3]:
ages = pd.Series([25, 30, 25], name='Age')

In [4]:
df.to_csv('./trial/data.csv')
df_read = pd.read_csv('./trial/data.csv')

In [5]:
df.to_pickle('./trial/data.pkl')
df_read = pd.read_pickle('./trial/data.pkl')

df.to_parquet('./trial/data.pqt')
df_read = pd.read_parquet('./trial/data.pqt')

manipulating data

In [11]:
df.groupby('Country')['Age'].mean()

Country
MY    30.0
SN    25.0
TH    25.0
Name: Age, dtype: float64

In [25]:
df1 = df.iloc[:2]
df2 = df.iloc[1:]
df3 = pd.concat([df1, df2])
df4 = pd. merge(df1, df2, how="left", on="Name")

other example on **data binning**

In [26]:
# Sample age data
ages = [20, 22, 25, 27, 21, 29, 31, 35, 37, 40, 38]

# Create bins
bins = [18, 25, 35, 50]

# Use cut
age_groups = pd.cut(ages, bins)

In [27]:
quantile_age_groups = pd.qcut(ages, 4)

_random data_

In [33]:
data = np.random.randn(10)
series = pd.Series(data)

rolling_window = series.rolling(window=3)
moving_average = rolling_window.mean()
moving_average

0         NaN
1         NaN
2   -0.845040
3   -0.531735
4    0.289752
5    0.713448
6    0.738497
7    0.618665
8    0.375221
9    0.465233
dtype: float64

In [32]:
expanding_window = series.expanding()
cumulative_sum = expanding_window.sum()
cumulative_sum

0    0.684621
1    1.059548
2    0.654011
3    0.961579
4    1.234506
5    1.723379
6    1.863458
7    1.360538
8    1.603630
9    2.510081
dtype: float64

try time series

In [43]:
data = {'Date': ['2021-01-01', '2021-01-02', '2021-01-03'],
        'Value': [100, 200, 300]}
df = pd.DataFrame(data)

df['Date'] = pd.to_datetime(df['Date'])

In [44]:
df = df.set_index('Date')
monthly_resampled_data = df.resample('ME').mean()

In [45]:
monthly_data = df.asfreq('ME')

In [47]:
dates = pd.date_range('20210101', periods=6)
df = pd.DataFrame({'Value': [100, 200, 150, 175, 200, 220]}, index=dates)

# shift (forecast) by 3 days
df['Lagged_Value'] = df['Value'].shift(periods=3, freq="D")
df

Unnamed: 0,Value,Lagged_Value
2021-01-01,100,
2021-01-02,200,
2021-01-03,150,
2021-01-04,175,100.0
2021-01-05,200,200.0
2021-01-06,220,150.0


categorical data

In [50]:
df = pd.DataFrame({
    'Grade' : ['A', 'B', 'A', 'C', 'B', 'A', 'D', 'A', 'C', 'B']
})
df['Grade'] = df['Grade'].astype('category')

In [53]:
# binning age into categories
ages = pd.Series([20, 22, 25, 27, 21, 29, 31, 35, 37, 40, 38])
bins = [18, 25, 35, 50]
age_categories = pd.cut(ages, bins)

0     (18, 25]
1     (18, 25]
2     (18, 25]
3     (25, 35]
4     (18, 25]
5     (25, 35]
6     (25, 35]
7     (25, 35]
8     (35, 50]
9     (35, 50]
10    (35, 50]
dtype: category
Categories (3, interval[int64, right]): [(18, 25] < (25, 35] < (35, 50]]

In [55]:
unique_grades = df['Grade'].unique()

['A', 'B', 'C', 'D']
Categories (4, object): ['A', 'B', 'C', 'D']

In [58]:
grade_counts = df['Grade'].value_counts()

In [59]:
unique_grades_count = df['Grade'].nunique()

# Afghanistan Data Training -- Checking

In [1]:
import geopandas as gpd

In [2]:
training = './afg/Training.shp'

df_training = gpd.read_file(training)

for column in df_training.columns:
    unique_values = df_training[column].unique()
    print(f"Unique value in column '{column}: {unique_values}'")

Unique value in column 'OBJECTID: [   3    4    5 ... 5662 5666 5674]'
Unique value in column 'Premitives: ['BRS' 'BSD' 'URB' 'AGI' 'WAT' 'AGV' 'AGR' 'NHS' 'NFS' 'AGT' 'Marshland'
 'SNW']'
Unique value in column 'year: [2018 2017 2016 2015 2014 2013 2011 2010 2009 2008 2003    0]'
Unique value in column 'Sub: ['Soil' 'Dunes' 'Sand' 'Rock' 'URB' 'AGI' 'WAT' 'AGV' 'AGR' 'NHS' 'Shrubs'
 'AGT' 'Pesta' 'Open' 'RiverBed' 'S_Marsh' 'P_Marsh' 'SNW' 'Closed' None]'
Unique value in column 'PLOTID: [    1     2     3 ... 43333 43334     0]'
Unique value in column 'Remarks: [None '1' '0' '9' 'C' '00' 'FAO WAT Look BRS' 'AGV/AGT' 'c']'
Unique value in column 'landcover_: [ 7 11  0  2  9  3  4  6  5  1  8 10]'
Unique value in column 'geometry: <GeometryArray>
[<POINT (64.111 29.428)>,   <POINT (62.416 29.4)>, <POINT (62.467 29.403)>,
 <POINT (62.519 29.405)>,  <POINT (62.57 29.408)>, <POINT (62.775 29.417)>,
  <POINT (62.826 29.42)>, <POINT (63.749 29.459)>, <POINT (63.801 29.461)>,
 <POINT (63.903 

In [3]:
df_training.sort_values(by=['OBJECTID'])

Unnamed: 0,OBJECTID,Premitives,year,Sub,PLOTID,Remarks,landcover_,geometry
41412,0,AGI,0,,0,,2,POINT (69.46464 35.26064)
31137,0,NHS,2018,NHS,29126,,6,POINT (68.25570 33.79920)
31138,0,BRS,2018,Soil,29127,,7,POINT (68.32380 33.80290)
31139,0,BRS,2018,Soil,29128,,7,POINT (68.64170 33.81970)
31140,0,BRS,2018,Soil,29129,,7,POINT (68.93720 33.83470)
...,...,...,...,...,...,...,...,...
7474,24276,URB,2018,URB,15843,,0,POINT (66.04243 31.96510)
7475,24277,NHS,2018,NHS,15844,,6,POINT (63.99532 32.66639)
21566,24278,NHS,2018,NHS,15845,,6,POINT (67.52406 35.10234)
21567,24279,BRS,2018,Rock,15846,,7,POINT (70.77647 35.81953)


In [4]:
df_training = df_training.drop_duplicates(subset=['Premitives'])
df_training

Unnamed: 0,OBJECTID,Premitives,year,Sub,PLOTID,Remarks,landcover_,geometry
0,3,BRS,2018,Soil,1,,7,POINT (64.11093 29.42818)
3,6,BSD,2018,Dunes,4,,11,POINT (62.51852 29.40501)
62,104,URB,2018,URB,63,,0,POINT (64.36164 29.57232)
147,253,AGI,2018,AGI,150,,2,POINT (65.69787 29.70252)
889,1375,WAT,2018,WAT,901,,9,POINT (62.93083 30.23444)
1021,1598,AGV,2018,AGV,1034,,3,POINT (63.33403 30.43261)
1398,2303,AGR,2018,AGR,1428,,4,POINT (66.28647 30.84331)
1512,2478,NHS,2018,NHS,1545,,6,POINT (61.80094 30.80626)
1736,2855,NFS,2018,Shrubs,1782,,5,POINT (67.58699 31.27002)
2353,3983,AGT,2018,AGT,2420,,1,POINT (67.57882 31.81120)


In [5]:
df_training.sort_values(by=['landcover_'])

Unnamed: 0,OBJECTID,Premitives,year,Sub,PLOTID,Remarks,landcover_,geometry
62,104,URB,2018,URB,63,,0,POINT (64.36164 29.57232)
2353,3983,AGT,2018,AGT,2420,,1,POINT (67.57882 31.81120)
147,253,AGI,2018,AGI,150,,2,POINT (65.69787 29.70252)
1021,1598,AGV,2018,AGV,1034,,3,POINT (63.33403 30.43261)
1398,2303,AGR,2018,AGR,1428,,4,POINT (66.28647 30.84331)
1736,2855,NFS,2018,Shrubs,1782,,5,POINT (67.58699 31.27002)
1512,2478,NHS,2018,NHS,1545,,6,POINT (61.80094 30.80626)
0,3,BRS,2018,Soil,1,,7,POINT (64.11093 29.42818)
10930,0,Marshland,2018,S_Marsh,31246,,8,POINT (61.31800 29.91120)
889,1375,WAT,2018,WAT,901,,9,POINT (62.93083 30.23444)
