In [1]:
# Import Libraries

import numpy as np  
import pandas as pd         
import seaborn as sns   
import matplotlib.pyplot as plt 
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

In [2]:
class evalModel:
    
    def __init__(self, X, y, random_state=42, model=None, scaler = None):
        self.X = X
        self.y = y
        self.random_state = random_state
        self.model = model if model is not None else LinearRegression()
        self.scaler = scaler if scaler is not None else StandardScaler()
        
    def split_data(self):
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.y,
                                                                                test_size=0.2, 
                                                                                random_state=self.random_state
                                                                                    )
        return self
    
    def transform_data(self):
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
        return self
        
    def fit_model(self):
        self.linear = self.model.fit(self.X_train, self.y_train)
        return self
        
    def prediction(self):
        self.predict = self.linear.predict(self.y_test)
        return self
        
    def model_metrics(self):
        self.mae = mean_absolute_error(self.y_test, self.predict)
        self.mse = mean_squared_error(self.y_test, self.predict)
        self.r2 = r2_score(self.y_test, self.predict)
        return {"mae": round(self.mae, 3), "mse": round(self.mse, 3), "r2": round(self.r2, 3)}

    def run(self):
        self.split_data().transform_data().fit_model().prediction()
        return self.model_metrics()    
        

In [3]:
df = pd.read_csv("StudentPerformance.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 6 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Hours Studied                     10000 non-null  int64  
 1   Previous Scores                   10000 non-null  int64  
 2   Extracurricular Activities        10000 non-null  object 
 3   Sleep Hours                       10000 non-null  int64  
 4   Sample Question Papers Practiced  10000 non-null  int64  
 5   Performance Index                 10000 non-null  float64
dtypes: float64(1), int64(4), object(1)
memory usage: 468.9+ KB


In [4]:
df.columns = df.columns.str.replace(" ", "_").str.lower().str.strip()
df.columns

Index(['hours_studied', 'previous_scores', 'extracurricular_activities',
       'sleep_hours', 'sample_question_papers_practiced', 'performance_index'],
      dtype='object')

In [5]:
# X = df.drop(columns=['extracurricular_activities','hours_studied']).values
X = df[['hours_studied']].values

y = df['previous_scores'].values.reshape(-1, 1)

print(X.shape)
print(y.shape)

evaluator = evalModel(X, y).run()
print(evaluator)


(10000, 1)
(10000, 1)
{'mae': 19.939, 'mse': 582.951, 'r2': -0.94}


In [6]:
mydec = pd.DataFrame({"sampleID":["IZ23444",
                                 "IZ23444",
                                 "IZ00993",
                                 "IZ34555"]}
                        )

mydec

Unnamed: 0,sampleID
0,IZ23444
1,IZ23444
2,IZ00993
3,IZ34555


In [7]:
x = "IZ23"
y = 1
print(int(x[-1]) +y)

" ".join(x +"_1")

4


'I Z 2 3 _ 1'

In [8]:
bolval = mydec["sampleID"].duplicated()

for sampleid, booval in zip(mydec["sampleID"], bolval):
    if booval is True:
        sampleid = sampleid + "_1"
        print(sampleid)
        
        

IZ23444_1


In [9]:
boolval = mydec["sampleID"].duplicated()
myseries = mydec["sampleID"]

mynew = pd.concat([myseries, boolval], axis=1)

mynew2 = mynew.copy()

mynew3 = mynew2.rename(columns={"sampleID":"sampID",
                              "sampleID": "booID"})

mynew3.reset_index()

Unnamed: 0,index,booID,booID.1
0,0,IZ23444,False
1,1,IZ23444,True
2,2,IZ00993,False
3,3,IZ34555,False


In [10]:
def comb_data(mydec):
    
    boolval = mydec["sampleID"].duplicated()

    for sampleid, booval in zip(mydec["sampleID"], boolval):
        if booval is True:
            sampleid = sampleid + "_1"
            
    return mydec

comb_data(mydec)

Unnamed: 0,sampleID
0,IZ23444
1,IZ23444
2,IZ00993
3,IZ34555


In [11]:
myset = list(set(mydec["sampleID"]))
print(myset)

['IZ23444', 'IZ34555', 'IZ00993']


In [12]:

myset = list(set(mydec["sampleID"]))

for index,row in mydec["sampleID"].items():
    if row in myset:
        
        print(row)

IZ23444
IZ23444
IZ00993
IZ34555


In [13]:
mydict = {}
rep = []

for index, row in mydec["sampleID"].items():
    mydict[row] = mydict.get(row,0)+1

for key,value in mydict.items():
    if value > 1:
        rep.append(key)

 ABC -- EASY AS DO RA ME

In [45]:
mydec = pd.DataFrame({"sampleID":["IZ23444",
                                 "IZ23444",
                                 "IZ00993",
                                 "IZ34555"]}
                        )

mydec

Unnamed: 0,sampleID
0,IZ23444
1,IZ23444
2,IZ00993
3,IZ34555


In [46]:
mydec["cumcount"] = mydec.groupby("sampleID").cumcount()
# mydec["newID"] = mydec["sampleID"] + str
mydec

Unnamed: 0,sampleID,cumcount
0,IZ23444,0
1,IZ23444,1
2,IZ00993,0
3,IZ34555,0


In [47]:
mydec["cumcount"] = mydec.groupby("sampleID").cumcount()

for idx, row in mydec.iterrows():
    if row["cumcount"] >= 1:
        mydec.loc[idx, "sampleID"] = f"{row['sampleID']}_{row['cumcount']}"
        

mydec  


Unnamed: 0,sampleID,cumcount
0,IZ23444,0
1,IZ23444_1,1
2,IZ00993,0
3,IZ34555,0


In [48]:
for idx, row in mydec.iterrows():
    print(row["sampleID"])

IZ23444
IZ23444_1
IZ00993
IZ34555
