In [1]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from sklearn.linear_model import LinearRegression
from matplotlib import pyplot


In [2]:
# Load the raw export and discard the "Unnamed: 0" column, which is not
# needed for the analysis.
data = pd.read_csv("dataset.csv").drop(columns=["Unnamed: 0"])

# Remove every row that contains at least one missing value and report
# how many rows were lost.
original_length = len(data)
data = data.dropna()
print(f"{original_length - len(data)} NaN items dropped.")

# Trim leading whitespace from 'quality' and trailing whitespace from 'gun'.
data = data.assign(
    quality=data["quality"].str.lstrip(),
    gun=data["gun"].str.rstrip(),
)

data.head()

342 NaN items dropped.


Unnamed: 0,gun,skin,rarity,stattrak,quality,price,quantity
0,AK-47,Point Disarray,Classified,0,Factory New,45.05,44.0
1,AK-47,Point Disarray,Classified,0,Minimal Wear,31.35,81.0
2,AK-47,Point Disarray,Classified,0,Field-Tested,19.59,122.0
3,AK-47,Point Disarray,Classified,0,Well-Worn,22.82,17.0
4,AK-47,Point Disarray,Classified,0,Battle-Scarred,16.42,40.0


In [3]:
# List the distinct labels in the two categorical columns so we know which
# strings the numeric encodings have to cover (blank line between the two).
print(data["rarity"].unique(), "", data["quality"].unique(), sep="\n")

['Classified' 'Restricted' 'Covert' 'Mil-Spec' 'Industrial' 'Consumer']

['Factory New' 'Minimal Wear' 'Field-Tested' 'Well-Worn' 'Battle-Scarred']


In [4]:
# Ordinal encodings: each label is assigned its 1-based position in the
# list, so a larger number means a rarer skin / a better wear condition.
rarity_map = {
    tier: rank
    for rank, tier in enumerate(
        ["Consumer", "Industrial", "Mil-Spec", "Restricted",
         "Classified", "Covert", "Contraband"],
        start=1,
    )
}
quality_map = {
    wear: rank
    for rank, wear in enumerate(
        ['Battle-Scarred', 'Well-Worn', 'Field-Tested', 'Minimal Wear', 'Factory New'],
        start=1,
    )
}

# Replace the text labels with their numeric codes; a label that is missing
# from its map becomes NaN.
for column, encoding in (("rarity", rarity_map), ("quality", quality_map)):
    data[column] = data[column].map(encoding)

# Treat a missing quantity as zero. Rows containing NaN were already removed
# by the dropna() in the loading cell, so this is presumably a no-op safeguard.
data["quantity"] = data["quantity"].fillna(value=0)

print(data)

         gun             skin  rarity  stattrak  quality   price  quantity
0      AK-47   Point Disarray       5         0        5   45.05      44.0
1      AK-47   Point Disarray       5         0        4   31.35      81.0
2      AK-47   Point Disarray       5         0        3   19.59     122.0
3      AK-47   Point Disarray       5         0        2   22.82      17.0
4      AK-47   Point Disarray       5         0        1   16.42      40.0
...      ...              ...     ...       ...      ...     ...       ...
9867  XM1014     Teclu Burner       4         1        5    9.98      15.0
9868  XM1014     Teclu Burner       4         1        4     5.6      18.0
9869  XM1014     Teclu Burner       4         1        3     2.9      31.0
9870  XM1014     Teclu Burner       4         1        2     3.8      11.0
9871  XM1014     Teclu Burner       4         1        1    2.37      11.0

[9530 rows x 7 columns]


In [5]:
# Parse 'price' as a number and downcast it to the smallest float dtype that
# fits; the reduced precision is why 45.05 is later displayed as 45.049999.
data = data.assign(price=pd.to_numeric(data["price"], downcast="float"))

In [6]:
# Weapon names grouped by category. The names must match the 'gun' column
# exactly; any weapon that is not listed here is labelled 'none'.
pistols = ['CZ75-Auto', 'Desert Eagle', 'Dual Berettas', 'Five-SeveN', 'Glock-18',
           'P2000', 'P250', 'R8 Revolver', 'Tec-9', 'USP-S']
# 'G3SG1' was previously misspelled 'G36G1', so that rifle fell through to 'none'.
rifles = ['AK-47', 'AUG', 'AWP', 'FAMAS', 'G3SG1', 'Galil AR', 'M4A1-S', 'M4A4',
          'SCAR-20', 'SG 553', 'SSG 08']
# The duplicated 'P90' entry has been removed.
smgs = ['MAC-10', 'MP5-SD', 'MP7', 'MP9', 'PP-Bizon', 'P90', 'UMP-45']
heavies = ['MAG-7', 'Nova', 'Sawed-Off', 'XM1014', 'M249', 'Negev']
# '★ Falchion Knife' was previously misspelled '★ Flachion Knife'.
knives = ['★ Nomad Knife', '★ Skeleton Knife', '★ Survival Knife', '★ Paracord Knife',
          '★ Classic Knife', '★ Navaja Knife', '★ Stiletto Knife', '★ Talon Knife',
          '★ Ursus Knife', '★ Bayonet', '★ Bowie Knife', '★ Butterfly Knife',
          '★ Falchion Knife', '★ Flip Knife', '★ Gut Knife', '★ Huntsman Knife',
          '★ Karambit', '★ M9 Bayonet', '★ Shadow Daggers']

# Flatten the groups into one weapon -> category lookup. The groups are
# disjoint, so the order in which they are merged does not affect the result.
category_by_weapon = {
    weapon: label
    for label, weapons in (
        ('pistol', pistols),
        ('rifle', rifles),
        ('smg', smgs),
        ('heavy', heavies),
        ('knife', knives),
    )
    for weapon in weapons
}

# Vectorised lookup: weapons missing from the table map to NaN, which is then
# replaced by the 'none' label. `categories` stays a plain list, one entry per
# row of `data`, in row order.
categories = data['gun'].map(category_by_weapon).fillna('none').tolist()

data['category'] = categories
data.head()

Unnamed: 0,gun,skin,rarity,stattrak,quality,price,quantity,category
0,AK-47,Point Disarray,5,0,5,45.049999,44.0,rifle
1,AK-47,Point Disarray,5,0,4,31.35,81.0,rifle
2,AK-47,Point Disarray,5,0,3,19.59,122.0,rifle
3,AK-47,Point Disarray,5,0,2,22.82,17.0,rifle
4,AK-47,Point Disarray,5,0,1,16.42,40.0,rifle


In [7]:
# Persist the cleaned, categorised frame. The row index is written as well
# (the pandas default, spelled out here), so the file starts with an unnamed
# index column.
data.to_csv(path_or_buf='cleaned dataset with categories.csv', index=True)