In [1]:
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MinMaxScaler, LabelEncoder

# 1. Converting our Dataset Values from Turkish to English

## 1.1 Loading our dataset

In [2]:
# Load the dataset
df = pd.read_csv('15.csv')
df

Unnamed: 0,Marka,Fiyat,İşlemci Tipi,SSD Kapasitesi,Ram (Sistem Belleği),Ekran Kartı,Kapasite,İşletim Sistemi,Ekran Kartı Bellek Tipi,Ekran Kartı Tipi,...,Ekran Kartı Hafızası,Temel İşlemci Hızı (GHz),Bağlantılar,Cihaz Ağırlığı,Ekran Boyutu,İşlemci Frekansı,Ekran Yenileme Hızı,Panel Tipi,Menşei,Arttırılabilir Azami Bellek
0,XASER,13716.90,Intel Core i5,512 GB,16 GB,AMD Radeon RX 550,512 GB,Free Dos (İşletim Sistemi Yok),GDDR5,Harici,...,,,,,,,,,,
1,DMC,13678.00,Intel Core i5,512 GB,16 GB,Nvidia Geforce GT 740,Yok,Windows,DDR3,Harici,...,4 GB ve altı,3.2,HDMI,Belirtilmemiş,24 inç,3.00 GHz üstü,75 Hz,,,
2,XASER,12857.74,Intel Core i5,512 GB,16 GB,AMD Radeon RX 580,512 GB,Free Dos (İşletim Sistemi Yok),GDDR5,Harici,...,,,,,,,,,,
3,DMC,20799.00,Intel Core i5,256 GB,32 GB,AMD Radeon RX 580,256 GB,Windows,GDDR5,Harici,...,8 GB,2.9,HDMI,Belirtilmemiş,"24"" / 61 Ekran",,165 Hz,VA,TR,
4,XASER,12834.62,Intel Core i5,512 GB,16 GB,AMD Radeon RX 550,512 GB,Free Dos (İşletim Sistemi Yok),GDDR5,Harici,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2462,HP,42606.03,Intel Core i7,1 TB,64 GB,Nvidia GeForce GT 730,Yok,Windows,DDR3,Harici,...,4 GB ve altı,1.5,USB,,,,,,,64 GB
2463,HP,51946.97,Intel Core i7,2 TB,64 GB,Nvidia GeForce GTX 1650,Yok,Windows,GDDR6,Harici,...,4 GB ve altı,1.5,USB,,,,,,,64 GB
2464,HP,43954.05,Intel Core i7,4 TB,12 GB,Dahili Ekran Kartı,Yok,Windows,Dahili,Dahili,...,Paylaşımlı,1.5,USB,,,,,,,64 GB
2465,HP,43963.48,Intel Core i7,2 TB,48 GB,Nvidia GeForce GT 730,Yok,Windows,DDR3,Harici,...,4 GB ve altı,1.5,USB,,,,,,,64 GB


## 1.2 Mapping Attributes from Turkish to English

In [3]:
# Display original columns
print("Original columns:")
print(df.columns.tolist())
print("\n")

# Dictionary for translating Turkish column names to English
column_translation = {
    'Marka': 'Brand',
    'Fiyat': 'Price',
    'İşlemci Tipi': 'Processor_Type',
    'SSD Kapasitesi': 'SSD_Capacity',
    'Ram (Sistem Belleği)': 'RAM',
    'Ekran Kartı': 'Graphics_Card',
    'Kapasite': 'Capacity',
    'İşletim Sistemi': 'Operating_System',
    'Ekran Kartı Bellek Tipi': 'Graphics_Memory_Type',
    'Ekran Kartı Tipi': 'Graphics_Card_Type',
    'Garanti Tipi': 'Warranty_Type',
    'Ram (Sistem Belleği) Tipi': 'RAM_Type',
    'İşlemci Çekirdek Sayısı': 'Processor_Core_Count',
    'İşlemci Nesli': 'Processor_Generation',
    'İşlemci Modeli': 'Processor_Model',
    'Çözünürlük': 'Resolution',
    'Power Supply': 'Power_Supply',
    'Kullanım Amacı': 'Usage_Purpose',
    'Ekran Kartı Hafızası': 'Graphics_Card_Memory',
    'Temel İşlemci Hızı (GHz)': 'Base_Processor_Speed_GHz',
    'Bağlantılar': 'Connections',
    'Cihaz Ağırlığı': 'Device_Weight',
    'Ekran Boyutu': 'Screen_Size',
    'İşlemci Frekansı': 'Processor_Frequency',
    'Ekran Yenileme Hızı': 'Screen_Refresh_Rate',
    'Panel Tipi': 'Panel_Type',
    'Menşei': 'Country_of_Origin',
    'Arttırılabilir Azami Bellek': 'Expandable_Max_Memory'
}

# Rename columns
df = df.rename(columns=column_translation)

# Display translated columns
print("Translated columns:")
print(df.columns.tolist())
print("\n")

# Display data info before preprocessing
print("Data info before preprocessing:")
print(df.info())
print("\n")


Original columns:
['Marka', 'Fiyat', 'İşlemci Tipi', 'SSD Kapasitesi', 'Ram (Sistem Belleği)', 'Ekran Kartı', 'Kapasite', 'İşletim Sistemi', 'Ekran Kartı Bellek Tipi', 'Ekran Kartı Tipi', 'Garanti Tipi', 'Ram (Sistem Belleği) Tipi', 'İşlemci Çekirdek Sayısı', 'İşlemci Nesli', 'İşlemci Modeli', 'Çözünürlük', 'Power Supply', 'Kullanım Amacı', 'Ekran Kartı Hafızası', 'Temel İşlemci Hızı (GHz)', 'Bağlantılar', 'Cihaz Ağırlığı', 'Ekran Boyutu', 'İşlemci Frekansı', 'Ekran Yenileme Hızı', 'Panel Tipi', 'Menşei', 'Arttırılabilir Azami Bellek']


Translated columns:
['Brand', 'Price', 'Processor_Type', 'SSD_Capacity', 'RAM', 'Graphics_Card', 'Capacity', 'Operating_System', 'Graphics_Memory_Type', 'Graphics_Card_Type', 'Warranty_Type', 'RAM_Type', 'Processor_Core_Count', 'Processor_Generation', 'Processor_Model', 'Resolution', 'Power_Supply', 'Usage_Purpose', 'Graphics_Card_Memory', 'Base_Processor_Speed_GHz', 'Connections', 'Device_Weight', 'Screen_Size', 'Processor_Frequency', 'Screen_Refresh_

In [4]:
# Converting nan/NA values written in Turkish to English
df.replace("BelirtilmemiÅŸ ", np.nan, inplace=True) #"Belirtilmemiş" is a Turkish word that means "unspecified" or "not stated."
df.replace("Belirtilmemiş",np.nan,inplace=True) # Similar to the previous one
df.replace("Yok", np.nan, inplace=True) # "Yok" is a Turkish word that means "no," "not available," or "does not exist."

## 1.3 Missing Values and Removal of Attributes

In [5]:
# Check for missing values
print("\nMissing values per column:")
print(df.isnull().sum())


Missing values per column:
Brand                          0
Price                          0
Processor_Type                 5
SSD_Capacity                 201
RAM                          200
Graphics_Card                 12
Capacity                     908
Operating_System              13
Graphics_Memory_Type          42
Graphics_Card_Type            36
Warranty_Type                  7
RAM_Type                     143
Processor_Core_Count          46
Processor_Generation          58
Processor_Model               58
Resolution                  1733
Power_Supply                2171
Usage_Purpose                331
Graphics_Card_Memory         268
Base_Processor_Speed_GHz    1205
Connections                  293
Device_Weight               2160
Screen_Size                 1389
Processor_Frequency         2179
Screen_Refresh_Rate         2361
Panel_Type                  2339
Country_of_Origin           2387
Expandable_Max_Memory        316
dtype: int64


In [6]:
# --- Step 2: Removing attributes with more than 70% of the data missing ---

data_threshold=0.7*len(df)
deleted_columns=[]
for col in df.columns:
    if df[col].isnull().sum()> data_threshold:
        deleted_columns.append(col)
        df.drop(columns=col,inplace=True)
        
print("The columns that were removed are: ",deleted_columns)
df.info()

print("Number of duplicates before deletion: ")
print(df.duplicated().sum())

# Removing duplicated tuples
df=df.drop_duplicates()

print("Number of duplicates after deletion: ")
print(df.duplicated().sum())

The columns that were removed are:  ['Resolution', 'Power_Supply', 'Device_Weight', 'Processor_Frequency', 'Screen_Refresh_Rate', 'Panel_Type', 'Country_of_Origin']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2467 entries, 0 to 2466
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Brand                     2467 non-null   object 
 1   Price                     2467 non-null   float64
 2   Processor_Type            2462 non-null   object 
 3   SSD_Capacity              2266 non-null   object 
 4   RAM                       2267 non-null   object 
 5   Graphics_Card             2455 non-null   object 
 6   Capacity                  1559 non-null   object 
 7   Operating_System          2454 non-null   object 
 8   Graphics_Memory_Type      2425 non-null   object 
 9   Graphics_Card_Type        2431 non-null   object 
 10  Warranty_Type             2460 non-null   object 
 11  RAM_Type

## 1.4 Other necessary conversions from Turkish to English

In [7]:
# Function to generate random numbers (1, 2, 3, or 4)
def generate_number():
    return random.choice([1, 2, 4])

# Replacing major values to English
df.replace("Dahili Ekran Kartı","Integrated Graphics Card",inplace=True) #"Dahili Ekran Kartı" translates to "Integrated Graphics Card" in English.
df.replace("Free Dos (İşletim Sistemi Yok)","Free Dos (No Operating System)",inplace=True)
df.replace("Dahili","Internal",inplace=True)
df.replace("Harici","External",inplace=True)
df.replace("Paylaşımlı",2,inplace=True)
df.replace("3 - 4",3.5,inplace=True)
df.replace("SSD Yok",0,inplace=True)
df.replace("1 TB",1024,inplace=True)
df.replace("2 TB",2048,inplace=True)
df.replace("4 TB",4096,inplace=True)
df.replace("12+",12,inplace=True)
df.replace("Kablo","Cable",inplace=True)
df.replace("İ5-3470", "i5-3470",inplace=True)
df.replace("İ5-650","i5-650",inplace=True)

# Dictionary mapping Turkish to English
turkish_to_english_up= {
    "Ev - Okul": "Home - School",
    "Ofis - İş": "Office - Work",
    "Oyun": "Gaming",
    "Tasarım": "Design"
}

# Mapping dictionary to convert screen sizes to numeric values
screen_size_map = {
    '23.8 inç': 23.8,
    '24 inç': 24.0,
    '24" / 61 Ekran': 24.0,  # We can treat this as 24
    '19" / 48 Ekran': 19.0,  # We can treat this as 19
    '5 - 5,5 inç': 5.25,  # Averaging the range (5 + 5.5) / 2
    '22 - 24 inç': 23.0,  # Averaging the range (22 + 24) / 2
    '5,5 - 6 inç': 5.75,  # Averaging the range (5.5 + 6) / 2
    '27 inç': 27.0,
    '22 inç': 22.0,
    '21,5 inç': 21.5,
    '18 - 21 inç': 19.5,  # Averaging the range (18 + 21) / 2
    '20 inç': 20.0,
    '6 inç ve altı': 6.0  # Taking 6 as the value
}

# Dictionary mapping Turkish to English
turkish_to_english_guarantee = {
    'Resmi Distribütör Garantili': 'Official Distributor Guaranteed',
    'Zeiron Türkiye Garantili': 'Zeiron Turkey Guaranteed',
    'İthalatçı Garantili': 'Importer Guaranteed',
    'İzoly Türkiye Garantisi': 'Izoly Turkey Warranty',
    'Apple Türkiye Garantili': 'Apple Turkey Guaranteed',
    'Technopc Türkiye Garantili': 'Technopc Turkey Guaranteed',
    'Lenovo Türkiye Garantili': 'Lenovo Turkey Guaranteed',
    'Asus Türkiye Garantili': 'Asus Turkey Guaranteed',
    'HP Türkiye Garantili': 'HP Turkey Guaranteed',
    'ACER Türkiye Garantili': 'ACER Turkey Guaranteed',
    'Casper Türkiye Garantili': 'Casper Turkey Guaranteed',
    'Samsung TR Garantili': 'Samsung TR Guaranteed',
    'Dell Türkiye Garantili': 'Dell Turkey Guaranteed'
}

# Convert the column to just numbers (removing the "GB", "TB", etc.)
df['SSD_Capacity'] = df['SSD_Capacity'].apply(lambda x: int(str(x).split()[0]) if pd.notnull(x) else x)
df['RAM'] = df['RAM'].apply(lambda x: int(str(x).split()[0]) if pd.notnull(x) else x)
df['Capacity'] = df['Capacity'].apply(lambda x: int(str(x).split()[0]) if pd.notnull(x) else x)
df['Expandable_Max_Memory'] = df['Expandable_Max_Memory'].apply(lambda x: int(str(x).split()[0]) if pd.notnull(x) else x)
df['Graphics_Card_Memory'] = df['Graphics_Card_Memory'].apply(lambda x: int(str(x).split()[0]) if pd.notnull(x) else x)
df['Processor_Generation'] = df['Processor_Generation'].apply(lambda x: int(str(x).split('.')[0]) if pd.notnull(x) else x)

# Convert non-null values in 'Processor_Core_Count' to int
df['Processor_Core_Count'] = df['Processor_Core_Count'].apply(lambda x: int(x) if pd.notnull(x) else x)

# Convert non-null values in 'Base_Processor_Speed_GHz' to float
df['Base_Processor_Speed_GHz'] = df['Base_Processor_Speed_GHz'].apply(lambda x: float(x) if pd.notnull(x) else x)

# Replace only the values that are "4 GB ve altı"
df.loc[df["Graphics_Card_Memory"] == "4 GB ve altı", "Graphics_Card_Memory"] = df["Graphics_Card_Memory"].map(lambda x: generate_number())

# Example of applying it to a column
df['Warranty_Type'] = df['Warranty_Type'].map(turkish_to_english_guarantee)

# Apply the function to the 'Screen Size' column
df['Screen_Size'] = df['Screen_Size'].map(screen_size_map)

# Apply the mapping to all values in the 'Category' column
df['Usage_Purpose'] = df['Usage_Purpose'].map(turkish_to_english_up)


## 1.5 Displaying all the unique in each attribute

In [8]:
# Get unique values for all columns
unique_values_all_columns = {col: df[col].unique() for col in df.columns if col != 'Price'}

# Print unique values for each column with index and separate lines
for col, unique_vals in unique_values_all_columns.items():
    print(f"\nUnique values in '{col}':")
    for idx, val in enumerate(unique_vals):
        print(f"  [{idx}] {val}")



Unique values in 'Brand':
  [0] XASER
  [1] DMC
  [2] Zeiron
  [3] jetucuzal
  [4] Life Teknoloji
  [5] IZOLY
  [6] TURBOX
  [7] Super
  [8] RAMTECH
  [9] Gamepage
  [10] Apple
  [11] GAMELİNE
  [12] MSI
  [13] METSA
  [14] PCDEPO
  [15] Quantum Gaming
  [16] Canar
  [17] Gigabyte
  [18] ROGAME
  [19] LENOVO
  [20] EFS TEKNOLOJİ
  [21] HP
  [22] OEM
  [23] ASUS
  [24] OXpower
  [25] ARTITEKNİKPC
  [26] TOPLAMA
  [27] Güneysu Gaming
  [28] CASPER
  [29] UCARTECH
  [30] Technopc
  [31] DAGMOR
  [32] WARBOX
  [33] Avantron
  [34] Revenge
  [35] ColdPower
  [36] SECLIFE
  [37] TRİNİTY
  [38] Zetta
  [39] Corsair
  [40] RaXius
  [41] Oksid Bilişim Teknoloji
  [42] Tiwox
  [43] Jedi
  [44] Dell
  [45] Quadro
  [46] Rexdragon
  [47] Grundig
  [48] Redrock
  [49] Gaming Game
  [50] ACER
  [51] Tiranozor

Unique values in 'Processor_Type':
  [0] Intel Core i5
  [1] nan
  [2] Intel Core i7
  [3] AMD
  [4] Intel Core i3
  [5] AMD Ryzen 5
  [6] AMD Ryzen 9
  [7] M2
  [8] Apple M1
  [9] Intel Pent

## 1.6 Dataset after conversion

In [9]:
# Print dataframe info
print("\nDataFrame Info:")
df.info()

df.to_csv("Before-PreProcess.csv",index=False)


DataFrame Info:
<class 'pandas.core.frame.DataFrame'>
Index: 2466 entries, 0 to 2466
Data columns (total 21 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Brand                     2466 non-null   object 
 1   Price                     2466 non-null   float64
 2   Processor_Type            2461 non-null   object 
 3   SSD_Capacity              2266 non-null   float64
 4   RAM                       2267 non-null   float64
 5   Graphics_Card             2454 non-null   object 
 6   Capacity                  1558 non-null   float64
 7   Operating_System          2453 non-null   object 
 8   Graphics_Memory_Type      2424 non-null   object 
 9   Graphics_Card_Type        2430 non-null   object 
 10  Warranty_Type             2457 non-null   object 
 11  RAM_Type                  2324 non-null   object 
 12  Processor_Core_Count      2420 non-null   float64
 13  Processor_Generation      2408 non-null   float64
 