Skip to content

Commit

Permalink
Restore the sklearn OrdinalEncoder to compare the random dataset with the sampled datasets.
Browse files Browse the repository at this point in the history
  • Loading branch information
aboelhamd committed Aug 19, 2019
1 parent 5622d4b commit 1e8268d
Showing 1 changed file with 6 additions and 6 deletions.
12 changes: 6 additions & 6 deletions sklearn-train.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,17 +39,17 @@
classifiers = [SVC(kernel="linear", C=0.025)]

print("file name :", file)
data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:100000]
data = pd.read_csv(files[file], delimiter=r"\s+").dropna().iloc[:200000]

# if the number of records equals the number of classes, duplicate the data
if data.shape[0] == data.iloc[:,0].nunique():
data = data.append(data)

# words(features) encoding
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
features = enc.fit_transform(data.iloc[:,2:]).toarray()
# words (features) encoding
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder(dtype=np.int32)
features = enc.fit_transform(data.iloc[:,2:])

# save the encoder
enc_name = os.path.join(models_path, 'encoder'+'-'+file_no_ext)[:256]
joblib.dump(enc, enc_name)
Expand Down

0 comments on commit 1e8268d

Please sign in to comment.