In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')


Mounted at /content/drive


In [None]:
# Open Colab's upload widget and keep whatever it returns in `uploaded`
# (presumably a filename -> contents mapping; confirm against the Colab docs).
from google.colab import files
uploaded = files.upload()

Saving format.dat.txt to format.dat.txt
Saving train.dat.txt to train.dat.txt
Saving test.dat.txt to test.dat.txt


In [8]:
# Open Colab's upload widget again; in the recorded run this is where
# train.dat and test.dat (the files the later cells open) were uploaded.
from google.colab import files
uploaded = files.upload()

Saving test.dat to test.dat
Saving train.dat to train.dat


In [12]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.pipeline import Pipeline

# Read in the dataset: one document per line in both files.
with open('train.dat', 'r') as file:
    train_data = file.readlines()

with open('test.dat', 'r') as file:
    test_data = file.readlines()

# Separate class labels from descriptions.
# Each training line is "<label>\t<description>". Only the FIRST tab is a
# separator, so a description that itself contains tabs is kept whole
# instead of being truncated at its second tab.
train_cls, train_docs = [], []
for line_no, line in enumerate(train_data, start=1):
    raw = line.rstrip('\r\n')
    if not raw.strip():
        # Blank lines (e.g. a trailing newline at end of file) carry no sample.
        continue
    label, sep, text = raw.partition('\t')
    if not sep:
        raise ValueError(
            "train.dat line %d has no tab separating label and description" % line_no
        )
    train_cls.append(label.strip())
    train_docs.append(text.strip())

# Test lines are unlabeled; every line is kept (even blank ones) so that the
# predictions stay aligned one-to-one with the lines of test.dat.
test_docs = [line.strip() for line in test_data]

# Create a DataFrame for the training data
train_df = pd.DataFrame({'Class': train_cls, 'Description': train_docs})

# Step 2: Create Vectors using CountVectorizer
count_vect = CountVectorizer()
train_counts = count_vect.fit_transform(train_df['Description'])

# Step 3: Reduce the weightage of common words using TfidfTransformer
tfidf_transformer = TfidfTransformer()
train_tfidf = tfidf_transformer.fit_transform(train_counts)

# Step 4: Train the Naive Bayes classifier
# (kept as a step-by-step baseline; the predictions below come from the
# LinearSVC pipeline, not from this model)
clf = MultinomialNB().fit(train_tfidf, train_df['Class'])

# Step 5: Use a pipeline for building the model. The pipeline re-fits its own
# vectorizer and TF-IDF transformer on the training text, and re-uses them
# unchanged when transforming the test text in predict().
text_classifier = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('svc', LinearSVC()),
])
text_classifier = text_classifier.fit(train_df['Description'], train_df['Class'])

# Predict the results (test_docs was already loaded and stripped above)
predicted = text_classifier.predict(test_docs)

# Create a DataFrame for the predicted results
result = pd.DataFrame({'Predicted': predicted})

# Save the predicted results to a file: one label per line, no header/index
result.to_csv('result_predicted_svcdata.dat', index=False, header=False)


In [13]:
# Write the same predictions again under a .csv name, without index or header row.
result.to_csv('result_predicted_svcdata.csv', index=False, header=False)


In [26]:
import numpy as np
import pandas as pd
from collections import Counter, defaultdict
from scipy.sparse import csr_matrix
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import SGDClassifier

# Load the raw training and test files, one document per line.
with open('train.dat', 'r') as train_file:
    train_lines = train_file.read().splitlines()

with open('test.dat', 'r') as test_file:
    test_lines = test_file.read().splitlines()

# A training line carries its class label in the first character and the
# document text from the third character onwards; test lines are text only.
cls = [line[0] for line in train_lines]
vals = [line[2:] for line in train_lines]
tvals = test_lines

docs = vals
test_docs = tvals

tcls = list(map(int, cls))

# Tally how many training documents fall into each of the classes 1..5.
label_counts = Counter(tcls)
c1, c2, c3, c4, c5 = (label_counts[label] for label in (1, 2, 3, 4, 5))

print("c1 - c2 - c3 - c4 - c5 \n", c1, c2, c3, c4, c5)
print("Total classes - ", c1 + c2 + c3 + c4 + c5)

# Labels become an array; every document becomes a list of whitespace tokens.
tcls = np.asarray(tcls)
docs = list(map(str.split, docs))
test_docs = list(map(str.split, test_docs))




c1 - c2 - c3 - c4 - c5 
 3163 1494 1925 3051 4805
Total classes -  14438


In [28]:
def filterLen(docs, minlen):
    """Drop short tokens from every document.

    Args:
        docs: iterable of documents, each an iterable of tokens.
        minlen: minimum token length to keep (inclusive).

    Returns:
        A new list with one list per input document, holding only the tokens
        whose length is at least `minlen`, in their original order.
    """
    kept_docs = []
    for tokens in docs:
        kept = []
        for token in tokens:
            if len(token) >= minlen:
                kept.append(token)
        kept_docs.append(kept)
    return kept_docs

# Minimum number of characters a cleaned token must have to be kept.
MIN_TOKEN_LEN = 4


def _clean_tokens(token_docs):
    """Strip non-alphanumeric characters from every token and lower-case it.

    Args:
        token_docs: list of documents, each a list of string tokens.

    Returns:
        A new list of documents with the same number of tokens per document;
        a token made only of punctuation becomes the empty string.
    """
    return [
        [''.join(ch for ch in token if ch.isalnum()).lower() for token in doc]
        for doc in token_docs
    ]


# Clean FIRST, then filter by length. Filtering before cleaning let tokens
# such as "the," (4 raw characters) through as "the", and turned pure
# punctuation such as "----" into an empty-string vocabulary entry.
# The same helper is applied to train and test so both share one tokenisation.
docs = filterLen(_clean_tokens(docs), MIN_TOKEN_LEN)
test_docs = filterLen(_clean_tokens(test_docs), MIN_TOKEN_LEN)

def build_matrix(docs):
    """Build a sparse term-count matrix and its vocabulary from tokenised docs.

    Args:
        docs: list of documents, each a list of string tokens.

    Returns:
        (mat, idx) where `mat` is a scipy CSR matrix of shape
        (len(docs), vocabulary size) holding per-document token counts as
        doubles with sorted column indices, and `idx` maps each token to its
        column, numbered in order of first appearance.
    """
    # Assign column ids in first-seen order across all documents.
    vocab = {}
    for tokens in docs:
        for token in tokens:
            vocab.setdefault(token, len(vocab))

    # One stored entry per distinct token per document.
    per_doc_counts = [Counter(tokens) for tokens in docs]
    n_entries = sum(len(counts) for counts in per_doc_counts)

    col_ids = np.zeros(n_entries, dtype=int)
    counts_flat = np.zeros(n_entries, dtype=np.double)
    row_ptr = np.zeros(len(docs) + 1, dtype=int)

    cursor = 0
    for row, counts in enumerate(per_doc_counts):
        for token, freq in counts.items():
            col_ids[cursor] = vocab[token]
            counts_flat[cursor] = freq
            cursor += 1
        row_ptr[row + 1] = cursor

    mat = csr_matrix(
        (counts_flat, col_ids, row_ptr),
        shape=(len(docs), len(vocab)),
        dtype=np.double,
    )
    # Entries were written in per-document counting order; canonicalise them.
    mat.sort_indices()
    return mat, vocab

# Build the training term-count matrix and keep the token -> column mapping in `idx`.
train_mat, idx = build_matrix(docs)

def build_matrix_test(docs, idx):
    """Build a sparse term-count matrix for docs using an existing vocabulary.

    Tokens that are not keys of `idx` are skipped entirely: they get no stored
    entry. (Previously each unknown token still reserved a slot that was left
    as column 0 with value 0, which produced duplicated explicit zeros in
    column 0 and inflated the matrix's stored-entry count.)

    Args:
        docs: list of documents, each a list of string tokens.
        idx: dict mapping token -> column index, as returned by build_matrix.

    Returns:
        A scipy CSR matrix of shape (len(docs), len(idx)) holding per-document
        counts (as doubles) of the known tokens, with sorted column indices.
    """
    nrows = len(docs)
    ncols = len(idx)

    ind = []
    val = []
    ptr = np.zeros(nrows + 1, dtype=int)
    for row, d in enumerate(docs):
        for token, freq in Counter(d).items():
            col = idx.get(token)
            if col is None:
                # Out-of-vocabulary token: the model has no feature for it.
                continue
            ind.append(col)
            val.append(freq)
        # Row boundary reflects only the entries actually stored.
        ptr[row + 1] = len(ind)

    mat = csr_matrix(
        (np.asarray(val, dtype=np.double), np.asarray(ind, dtype=int), ptr),
        shape=(nrows, ncols),
        dtype=np.double,
    )
    mat.sort_indices()
    return mat

# Build the test matrix against the training vocabulary `idx`, so it has one column per training token.
test_mat = build_matrix_test(test_docs, idx)

def csr_idf(mat, copy=False, **kargs):
    """Scale a CSR matrix by inverse document frequency computed from itself.

    Each stored value in column c is multiplied by log(nrows / df(c)), where
    df(c) is the number of stored entries in column c.

    Args:
        mat: scipy CSR matrix; modified in place unless `copy` is True.
        copy: when exactly True, work on (and return) a copy of `mat`.
        **kargs: accepted and ignored.

    Returns:
        The scaled copy when `copy` is True; the column -> IDF weight mapping
        when `copy` is False; otherwise the (in-place scaled) input matrix.
    """
    target = mat.copy() if copy is True else mat
    n_docs = target.shape[0]
    n_stored = target.nnz
    columns, weights = target.indices, target.data

    # Document frequency: how many stored entries each column has.
    df = defaultdict(int)
    for col in columns:
        df[col] += 1

    # Turn the frequencies into IDF weights (keys are unchanged, so updating
    # values while walking the keys is safe).
    for col in df:
        df[col] = np.log(n_docs / float(df[col]))

    for pos in range(n_stored):
        weights[pos] *= df[columns[pos]]

    if copy is False:
        return df
    return target

def csr_l2normalize(mat, copy=False, **kargs):
    """Scale every row of a CSR matrix to unit Euclidean length.

    Rows whose stored values are all zero (or that store nothing) are left
    untouched.

    Args:
        mat: scipy CSR matrix; modified in place unless `copy` is True.
        copy: when exactly True, normalise and return a copy of `mat`.
        **kargs: accepted and ignored.

    Returns:
        The normalised copy when `copy` is True, otherwise None.
    """
    wants_copy = copy is True
    if wants_copy:
        mat = mat.copy()

    data, row_ptr = mat.data, mat.indptr
    for row in range(mat.shape[0]):
        start, stop = row_ptr[row], row_ptr[row + 1]

        squared_norm = 0.0
        for pos in range(start, stop):
            squared_norm += data[pos] ** 2

        if squared_norm != 0.0:
            scale = 1.0 / np.sqrt(squared_norm)
            for pos in range(start, stop):
                data[pos] *= scale

    if wants_copy:
        return mat

# TF-IDF weight and L2-normalise the training matrix.
train_mat1 = csr_idf(train_mat, copy=True)
train_mat2 = csr_l2normalize(train_mat1, copy=True)

# Weight the test matrix with the IDF learned from the TRAINING documents.
# Computing a separate IDF from the test set (as before) scales the same
# feature differently at train and predict time, and makes each prediction
# depend on which other documents happen to be in the test file.
train_doc_freq = np.bincount(train_mat.indices, minlength=train_mat.shape[1])
# Every vocabulary column comes from a training document, so its frequency is
# >= 1; the maximum() only guards against a division by zero.
train_idf = np.log(train_mat.shape[0] / np.maximum(train_doc_freq, 1))

test_mat1 = test_mat.copy()
test_mat1.data *= train_idf[test_mat1.indices]
test_mat2 = csr_l2normalize(test_mat1, copy=True)

# Model 1: multinomial Naive Bayes. One predicted label per line of output.
clf = MultinomialNB().fit(train_mat2, tcls)
predicted = clf.predict(test_mat2)
with open('predictions_1.dat', 'w') as file:
    file.writelines(str(p) + "\n" for p in predicted)

# Model 2: linear SVM trained with SGD (hinge loss, L2 penalty), fixed seed.
clf_2 = SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=10, tol=None).fit(train_mat2, tcls)
predicted_2 = clf_2.predict(test_mat2)
with open('predictions_2.dat', 'w') as file:
    file.writelines(str(p) + "\n" for p in predicted_2)
