In [1]:
#settings
# Hyper-parameters and loader sizes read by the later cells of this notebook.
learning_rate = 0.0001
batch_size = 64
data_loader_batch_size = 5120
epochs = 20

# Path handed to sqlite3.connect() below (and to CustomGameDataset later on).
newDatasetDir = 'D:\\data\\reformedUserData'
import sqlite3
import torch 
# Prefer the GPU when one is present; every tensor created below follows `device`.
device = torch.device("cpu")
if torch.cuda.is_available():
    print("has cuda")
    device = torch.device("cuda")
mainConnection = sqlite3.connect(newDatasetDir)
torch.set_printoptions(threshold=10_000)
#gameInteractions( row_id TEXT PRIMARY KEY, steam_id TEXT, app_id TEXT, playtime_2weeks INTEGER, playtime_forever INTEGER, is_recommended INTEGER)
# NOTE(review): the query below reads a `score` column that the schema note above does not list - confirm the schema.
try:
    cur2 = mainConnection.cursor()
    print("pulling classifications")
    dataPullClassificationList = cur2.execute("SELECT count FROM classifications")
    classificationCounts = dataPullClassificationList.fetchall()
    classificationsLen = len(classificationCounts)
    print("classification count:",classificationsLen)
    # every fetched row is a 1-tuple; keep just the count value
    classifications = [clasif[0] for clasif in classificationCounts]
    print("pulling game interactions")
    dataPullGameInteractions = cur2.execute("SELECT steam_id,app_id,score FROM gameInteractions")
    gameInteractions = dataPullGameInteractions.fetchall()
finally:
    # release the database handle even when one of the queries raises
    mainConnection.close()
#creating an array whose index matches the account id and whose value is the first row of that
#account in gameInteractions, so the (very large) interaction list is only scanned once
# NOTE: the loop compares each row's steam_id with a counter that grows by exactly one, so it
# assumes rows are grouped by steam_id and that the ids are consecutive integers starting at 0.
print("indexing gameInteractions")
accountIndexList = [0]
currentAccountIndex = 0
for key,game in enumerate(gameInteractions):
    if( game[0] != currentAccountIndex):
        accountIndexList.append(key)
        currentAccountIndex+=1
print("finished indexing gameInteractions")
# per-classification weight: 1 - count / number_of_indexed_accounts.
# Built on `device` (not a hard-coded .cuda()) so the notebook also runs on CPU-only machines.
accountTotal = len(accountIndexList)
populationPenaltyWeights = torch.tensor(
    [1-(clasif/accountTotal) for clasif in classifications],
    dtype=torch.float32,
    device=device,
)
gameIntLen = len(gameInteractions)
#times will be based on a per steam id basis




has cuda
pulling classifications
classification count: 9800
pulling game interactions
indexing gameInteractions
finished indexing gameInteractions


In [2]:
import random
class CustomGameDataset():
    """Map-style dataset that yields one ``[input, target]`` tensor pair per profile.

    The class reads module-level state prepared earlier in the notebook:
    ``gameInteractions`` (rows of ``(steam_id, app_id, score)``), ``accountIndexList``
    (first row of every profile), ``gameIntLen``, ``classificationsLen`` and ``device``.

    Args:
        reformedDataDir: path passed to ``sqlite3.connect`` to read the classifications table.
        startProfileIndex: first profile index served by this dataset (inclusive).
        endProfileIndex: last profile index served by this dataset (inclusive).
    """
    def __init__(self,reformedDataDir,startProfileIndex,endProfileIndex):
        print("init called")
        # dict: row position -> first column of the matching classifications row
        # (presumably the game id - confirm against the table schema)
        self.classifications = self.getClassification(reformedDataDir)
        #file location compared to python
        self.reformedDataDir = reformedDataDir
        #so that the datasets can be ripped apart for testing and training
        self.startProfileIndex = startProfileIndex
        self.endProfileIndex = endProfileIndex

    def __len__(self):
        """Number of profiles in the inclusive range ``[startProfileIndex, endProfileIndex]``."""
        #index on profiles start at 0
        # clamp at zero: len() raises ValueError for negative results
        return max(0, self.endProfileIndex - self.startProfileIndex + 1)

    def getClassification(self,reformedDataDir):
        """Return ``{row_position: first_column}`` for every row of ``classifications``."""
        classCon = sqlite3.connect(reformedDataDir)
        try:
            classCur = classCon.cursor()
            classCur.execute("SELECT * FROM classifications")
            tempClassifications = classCur.fetchall()
        finally:
            # the connection is only needed for this one query; do not leak it
            classCon.close()
        return {index: classif[0] for index, classif in enumerate(tempClassifications)}

    def __getitem__(self, idx):
        """Build the sample for the ``idx``-th profile of this dataset.

        Returns:
            ``[data, target]``: two float32 tensors of length ``classificationsLen`` on
            ``device``. ``target`` holds the score of every game of the profile; ``data``
            holds the scores of a random 75% subset (at least one game when any exist).

        Raises:
            IndexError: if ``idx`` is outside ``[0, len(self))``.
        """
        if not 0 <= idx < len(self):
            raise IndexError("profile index out of range")
        currentIndex = idx + self.startProfileIndex
        steamIDIndex = accountIndexList[currentIndex]
        # rows of one profile are contiguous: walk forward until the steam id changes
        gameList = []
        for x in range(steamIDIndex,gameIntLen):
            row = gameInteractions[x]
            if row[0] != currentIndex:
                break
            gameList.append([row[1],row[2]])
        dataList = [0] * classificationsLen
        targetList = [0] * classificationsLen
        # keep 75% of the games as input, but never fewer than one and never more than
        # exist (a profile without rows would otherwise make random.sample raise)
        sampleSize = min(max(1, int(len(gameList) * .75)), len(gameList))
        sampleGameList = random.sample(gameList,sampleSize)
        for game in gameList:
            targetList[game[0]] = game[1]
        for game in sampleGameList:
            dataList[game[0]] = game[1]
        # create the tensors on the notebook-wide `device` instead of forcing CUDA
        return [
            torch.tensor(dataList, dtype=torch.float32, device=device),
            torch.tensor(targetList, dtype=torch.float32, device=device),
        ]


In [3]:
from torch import nn
class NeuralNetwork(nn.Module):
    """Three equally sized linear layers separated by Hardtanh(-3, 12) activations.

    Args:
        num_features: width of the input, of both hidden layers and of the output.
            When omitted, the module-level ``classificationsLen`` is used, which is
            what the notebook relied on before this parameter existed.
    """
    def __init__(self, num_features=None):
        super().__init__()
        if num_features is None:
            # fall back to the notebook-wide classification count
            num_features = classificationsLen
        # attribute name kept as-is so previously saved models / state dicts still load
        self.linear_hardtanh_stack = nn.Sequential(
            nn.Linear(num_features, num_features),
            nn.Hardtanh(min_val=-3,max_val=12),
            nn.Linear(num_features, num_features),
            nn.Hardtanh(min_val=-3,max_val=12),
            nn.Linear(num_features, num_features),
        )

    def forward(self, x):
        """Run ``x`` through the stack and return the raw output of the last linear layer."""
        logits = self.linear_hardtanh_stack(x)
        return logits

In [4]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one training epoch, taking one optimizer step per batch.

    Every batch is reduced to a single (input, target) pair by averaging over the
    samples it contains; the model is then trained on that averaged pair.

    Args:
        dataloader: sized iterable of batches. A batch is either the default-collated
            form ``[inputs, targets]`` (two tensors whose first dimension indexes the
            samples) or an un-collated sequence of ``[input, target]`` samples.
        model: ``nn.Module`` to train; batches are moved to the device of its parameters.
        loss_fn: callable ``loss_fn(pred, target)`` returning a scalar tensor.
        optimizer: optimizer that owns ``model``'s parameters.
    """
    num_batches = len(dataloader)
    print("test")
    # Set the model to training mode - important for batch normalization and dropout layers
    # Unnecessary in this situation but added for best practices
    print("setting modle .train")
    model.train()
    print("finished setting modle .train")
    # follow the model's device rather than assuming CUDA
    firstParam = next(model.parameters(), None)
    modelDevice = firstParam.device if firstParam is not None else None
    for batch, sampleSet in enumerate(dataloader):
        if(batch %10 ==0):
            print("batch: ",(batch+1),"/",num_batches)
        # Default collation hands over [inputs, targets] already stacked along dim 0;
        # looping over that pair would treat the two stacks as if they were samples.
        head = sampleSet[0]
        if torch.is_tensor(head) and head.dim() > 1:
            inputs, targets = sampleSet[0], sampleSet[1]
        else:
            inputs = torch.stack([sample[0] for sample in sampleSet])
            targets = torch.stack([sample[1] for sample in sampleSet])
        if modelDevice is not None:
            inputs = inputs.to(modelDevice)
            targets = targets.to(modelDevice)
        #getting averages of set so that more games will be represented(can be weighted later)
        # mean over the samples actually present (the final batch may be smaller)
        dataMean = inputs.mean(dim=0)
        targetMean = targets.mean(dim=0)
        # Compute prediction and loss
        pred = model(dataMean)
        loss = loss_fn(pred, targetMean)
        # Backpropagation - clear old gradients first so nothing stale is accumulated
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

In [5]:


def test_loop(dataloader, model, loss_fn):
    model.eval()
    num_batches = len(dataloader)
    test_loss  = 0
    with torch.no_grad():
        for batch, sampleSet in enumerate(dataloader):
            if(batch %10 ==0):
                print("batch: ",(batch+1),"/",num_batches)
            inputTotal = torch.zeros(classificationsLen).cuda()
            targetTotal = torch.zeros(classificationsLen).cuda()
            for sample in sampleSet:
                input, target = sample[0], sample[1]
                inputTotal = inputTotal + input
                targetTotal = targetTotal + target
            dataMean = inputTotal.div(batch_size)
            targetMean = targetTotal.div(batch_size)
            pred = model(dataMean)
            
            test_loss += loss_fn(pred, targetMean).item()
    test_loss /= num_batches
    print(f"Test Error: Avg loss: {test_loss:>8f} \n")

In [6]:
def custom_loss(pred, target):
    """Weighted mean-squared error.

    The element-wise error ``pred - target`` is scaled by the module-level
    ``populationPenaltyWeights`` (one weight per classification, computed as
    1 - percentageAppearance) before being squared and averaged.
    """
    weightedError = (pred - target) * populationPenaltyWeights
    squaredError = weightedError * weightedError
    return squaredError.mean()
    

In [7]:
# the actual running network
from torch.utils.data import  DataLoader
model = NeuralNetwork().to(device)
#model = torch.load('model.pth')
print(model)
#subset between 70/80%
#take the interaction row at the 75% mark and use the steam id stored on that row as the
#boundary, so the split falls between accounts and no account is shared by both sets
print("looking for 75 percent index")
index75Percent = int(gameIntLen*.75)
steamIDIndexAtPercent = gameInteractions[index75Percent][0]
steamIDIndexAtEnd = gameInteractions[gameIntLen-1][0]
print("found index at ",index75Percent)
# CustomGameDataset treats both of its bounds as inclusive, so the test range has to begin
# one profile after the last training profile; otherwise the boundary profile is in both sets.
# The min() keeps the test set non-empty when the boundary profile is also the last one.
testStartProfileIndex = min(steamIDIndexAtPercent + 1, steamIDIndexAtEnd)
print("feeding train dataset")
#no collate_fn is passed: DataLoader's default collation stacks the samples of a batch
train_dl = DataLoader(CustomGameDataset(newDatasetDir,0,steamIDIndexAtPercent), batch_size = data_loader_batch_size, shuffle=True)
print("feeding test dataset")
test_dl = DataLoader(CustomGameDataset(newDatasetDir,testStartProfileIndex,steamIDIndexAtEnd), batch_size = data_loader_batch_size, shuffle=True)
print("setting loss and optimization")
loss_fn = custom_loss
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    train_loop(train_dl, model, loss_fn, optimizer)
    test_loop(test_dl, model, loss_fn)
# NOTE(review): this pickles the whole module object, so loading it needs this exact class
# definition to be importable; consider saving model.state_dict() instead.
torch.save(model,'model2.pth')
print("Done!")

NeuralNetwork(
  (linear_hardtanh_stack): Sequential(
    (0): Linear(in_features=9800, out_features=9800, bias=True)
    (1): Hardtanh(min_val=-3, max_val=12)
    (2): Linear(in_features=9800, out_features=9800, bias=True)
    (3): Hardtanh(min_val=-3, max_val=12)
    (4): Linear(in_features=9800, out_features=9800, bias=True)
  )
)
looking for 75 percent index
found index at  68360588
feeding train dataset
init called
feeding test dataset
init called
setting loss and optimization
Epoch 1
-------------------------------
test
setting modle .train
finished setting modle .train
batch:  1 / 67
batch:  11 / 67
batch:  21 / 67
batch:  31 / 67
batch:  41 / 67
batch:  51 / 67
batch:  61 / 67
batch:  1 / 54
batch:  11 / 54
batch:  21 / 54
batch:  31 / 54
batch:  41 / 54
batch:  51 / 54
Test Error: Avg loss: 0.000068 

Epoch 2
-------------------------------
test
setting modle .train
finished setting modle .train
batch:  1 / 67
batch:  11 / 67
batch:  21 / 67
batch:  31 / 67
batch:  41 / 67
bat