<a href="https://colab.research.google.com/github/alexlinapp/python_tools_practice/blob/main/data_manipulation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [46]:
import torch

In [2]:
x = torch.arange(12, dtype=torch.float32)
# x.view(3,4) returns a VIEW, not a copy, changing y will also change x
y = x.view(3,4)
y[1][2] = 5
y,x
# x.reshape(3,4) tries to return a VIEW. If it cannot (as not contigious [contigious here means contigious ]), will return a copy
# A tensor is contiguous if its elements are stored in a single, continuous block of memory and its strides follow the standard pattern for the tensor’s shape, meaning:
# The stride of the last dimension is 1.
# The stride of any other dimension i equals the product of the sizes of all dimensions after i.
# This ensures that moving along any dimension corresponds to stepping through memory in a predictable, sequential way.
# A stride is a tuple of integers that tells how many elements in memory you need to skip to move to the next element along each dimension of a tensor.
# For a tensor with shape (d0, d1, d2, ..., dn), stride[i] is how far (in elements) you jump in memory to move by one step in dimension i.
z = x.reshape(3,4).T
z,x,z.is_contiguous(),z.stride(),x.stride()

(tensor([[ 0.,  4.,  8.],
         [ 1.,  5.,  9.],
         [ 2.,  5., 10.],
         [ 3.,  7., 11.]]),
 tensor([ 0.,  1.,  2.,  3.,  4.,  5.,  5.,  7.,  8.,  9., 10., 11.]),
 False,
 (1, 4),
 (1,))

In [19]:
# use [:] to assign whole value to tensor without reallocating
x = torch.arange(12).reshape(3,4)
y = torch.arange(12).reshape(3,4)
Z = torch.zeros_like(y) # creates new tensor Z, allocates new memory for it
old = Z
Z[:] = x + y
print(id(Z), id(old), id(Z) == id(old))


135822506059408 135822506059408 True


In [16]:
import numpy as np
import pandas as pd

In [24]:
s = pd.Series([1,3,5,np.nan,6,8])
s
dates = pd.date_range("20250701", periods=6)
dates
df = pd.DataFrame(np.random.randn(6,4), index=dates, columns=list("ABCD"))
df


Unnamed: 0,A,B,C,D
2025-07-01,0.031812,-0.5997,1.802725,-2.177538
2025-07-02,0.410659,0.430892,0.859719,-0.289493
2025-07-03,-1.11398,-0.551872,0.251153,-0.442202
2025-07-04,0.493147,0.841402,-1.714584,-0.675644
2025-07-05,1.585147,0.674296,0.11775,0.23111
2025-07-06,0.262585,-0.534437,0.684366,2.101041


In [94]:
import os

os.makedirs(os.path.join('..', 'data'), exist_ok=True)
data_file = os.path.join('..', 'data', 'house_tiny.csv')
with open(data_file, 'w') as f:
    f.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')

test_file = os.path.join('.', 'test.csv')

with open(test_file, 'w') as f:
    f.write('''NumRooms,RoofType,Price
NA,NA,127500
2,NA,106000
4,Slate,178100
NA,NA,140000''')
data = pd.read_csv(test_file)
print(data)
inputs, targets = data.iloc[:,0:2], data.iloc[:, 2]
# data_replaed = data.fillna(data.mean())
# print(data_replaed)
inputs = pd.get_dummies(inputs, dummy_na=True)
print(inputs)
X = torch.tensor(inputs.to_numpy(dtype=float))
# y = torch.tensor(targets.values)
print(X)

   NumRooms RoofType   Price
0       NaN      NaN  127500
1       2.0      NaN  106000
2       4.0    Slate  178100
3       NaN      NaN  140000
   NumRooms  RoofType_Slate  RoofType_nan
0       NaN           False          True
1       2.0           False          True
2       4.0            True         False
3       NaN           False          True
tensor([[nan, 0., 1.],
        [2., 0., 1.],
        [4., 1., 0.],
        [nan, 0., 1.]], dtype=torch.float64)


In [101]:
import zipfile

with zipfile.ZipFile("iris.zip", "r") as zip_ref:
  print(zip_ref.namelist())
  with zip_ref.open("iris.data", "r") as file:
    print(file)
    data = pd.read_csv(file)
    #print(data.head())
    inputs, outputs = data.iloc[:, 0:4], data.iloc[:, 4]
    inputs = pd.get_dummies(inputs, dummy_na=True)
    inputs = torch.tensor(inputs.to_numpy(dtype=float))
    codes, uniques = pd.factorize(outputs)
    print(codes, uniques)


['Index', 'bezdekIris.data', 'iris.data', 'iris.names']
<zipfile.ZipExtFile name='iris.data' mode='r' compress_type=deflate>
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2] Index(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype='object')


In [138]:
# autograd

# two ways to initialize tensors to calculate gradients
# method 1:
x1 = torch.arange(5.0, requires_grad=True)
# method 2:
x2 = torch.arange(5, dtype=float)
x2.requires_grad_(True)


# When calling backward(), pytroch frees computation graph but does not set .grad to 0
y1 = 2 * torch.dot(x1, x1)
y1.backward()
print("x1 grad initial:", x1.grad)
# gradients accumalte, use x1.grad.zero_() to reset
y1 = 3 * torch.dot(x1, x1)
y1.backward()
print("x1 grad final:", x1.grad)


y2 = x2 * x2
# returns tensor that uses same memory allocation but detached from computational graph
u2 = y2.detach()
z2 = (x2 * u2).sum()
z2.backward()
print("This is u2:", u2)
print("x2 grad on z2.backward():", x2.grad, "\nEqual to u2?", x2.grad == u2)
x2.grad.zero_()
y2.sum().backward()
print("x2 grad on y2.backward():", x2.grad)
# reset gradients


x1 grad initial: tensor([ 0.,  4.,  8., 12., 16.])
x1 grad final: tensor([ 0., 10., 20., 30., 40.])
This is u2: tensor([ 0.,  1.,  4.,  9., 16.], dtype=torch.float64)
x2 grad on z2.backward(): tensor([ 0.,  1.,  4.,  9., 16.], dtype=torch.float64) 
Equal to u2? tensor([True, True, True, True, True])
x2 grad on y2.backward(): tensor([0., 2., 4., 6., 8.], dtype=torch.float64)
