In [1]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [2]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from collections import Counter
from zipfile import ZipFile
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from random import shuffle
import os
import operator
import nltk
import re
import string


In [3]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
os.chdir('/content/drive/MyDrive/Assignment-2-IR/')

In [7]:
zip_ref = ZipFile("20_newsgroups.zip", 'r')
zip_ref.extractall()
zip_ref.close()

In [5]:
class_list=[]
list_labels=['talk.politics.misc','comp.graphics',  'sci.space','rec.sport.hockey', 'sci.med' ] 
list_files=[]
for label in list_labels:
  for path_root, _, files_list in os.walk(str(os.getcwd())+'/'+"20_newsgroups"+'/'+str(label)):
      for classFile in files_list:
          path_File = os.path.join(path_root, classFile)
          list_files.append(path_File)
          class_list.append(label)
        


In [6]:
len(class_list),len(list_files)

(5000, 5000)

In [7]:
from typing import TextIO
def lower_case(text):
    lower_case_text = text.str.lower()
    return lower_case_text

def stop_word(text):
  sentence = []
  stop_words = set(stopwords.words("english"))
  for w in text:
        if w not in stop_words:
            sentence.append(w)
        else:
          continue
  return " ".join(sentence)

def remove_punc(text):
  punc_tokenizer = nltk.RegexpTokenizer(r"\w+")
  text = punc_tokenizer.tokenize(text)
  return text

def conversion(text):
  text = np.char.replace(text, "0", " zero ")
  text = np.char.replace(text, "1", " one ")
  text = np.char.replace(text, "2", " two ")
  text = np.char.replace(text, "3", " three ")
  text = np.char.replace(text, "4", " four ")
  text = np.char.replace(text, "5", " five ")
  text = np.char.replace(text, "6", " six ")
  text = np.char.replace(text, "7", " seven ")
  text = np.char.replace(text, "8", " eight ")
  text = np.char.replace(text, "9", " nine ")
  return text

def num2Words(text):
  return [word for word in conversion(text) ]

def lemmatization(text):
  lemmatizer = WordNetLemmatizer()
  tokenizer=nltk.tokenize.WhitespaceTokenizer()
  text = [lemmatizer.lemmatize(word) for word in tokenizer.tokenize(text)]
  return text 


In [8]:
def preProcessText(text):
  text =lower_case(text)
  text = text.str.replace('\[.*?\]', '')
  text = text.str.replace('https?://\S+|www\.\S+', '')
  text = text.str.replace('<.*?>+', '')
  text = text.str.replace('\n', '')
  text = text.str.replace('[%s]' % re.escape(string.punctuation), '')
  text = text.str.replace('\n', '')
  text = stop_word(text)
  text = lemmatization(text)
  return " ".join(text)

In [9]:
list_docs = []
for path in list_files:
  readFile = open(path, 'r',encoding='cp1250')
  txt = readFile.read().strip()
  readFile.close()
  list_docs.append(txt)
df_docs = pd.DataFrame([list_docs,class_list]).T
df_docs[0] = preProcessText(df_docs[0])
df_docs.to_pickle("df_docs")

  This is separate from the ipykernel package so we can avoid doing imports until
  after removing the cwd from sys.path.
  """
  import sys


In [10]:
df_docs

Unnamed: 0,0,1
0,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,talk.politics.misc
1,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,talk.politics.misc
2,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,talk.politics.misc
3,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,talk.politics.misc
4,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,talk.politics.misc
...,...,...
4995,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,sci.med
4996,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,sci.med
4997,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,sci.med
4998,xref cantaloupesrvcscmuedu altnewsmedia739 alt...,sci.med


In [11]:
def createClassBasedWords(data,class_list):
  temp = {}
  for j in range(len(data)):
    if class_list[j] not in temp.keys():
      temp[class_list[j]] = data[j]
    else:
      temp[class_list[j]] += data[j]
  return temp

def countWordPerClass(class_dict):
  word_dict = {}
  for classT in class_dict:
    words = set(class_dict[classT])
    for word in words:
      if word in word_dict.keys():
        word_dict[word] +=1
      else:
        word_dict[word] =1
  print(word_dict)
  return word_dict

def tf_Icf(words,class_dict,word_dict):
  count_words = Counter(words)
  tf_icf = {}
  for word in set(words):
    tf_value = count_words[word]
    icf_value = np.log(len(class_dict)/word_dict[word])
    tf_icf[word] = tf_value*icf_value
  print(tf_icf)
  return tf_icf

def fit(data):
  temp = createClassBasedWords(data[0].tolist(),data[0].tolist())
  words_list = []
  for i in temp:
    words_list += temp[i]
  word_dict = countWordPerClass(temp)
  tf_icf = tf_Icf(words_list,temp,word_dict)
  tf_icf_sorted = sorted(tf_icf.items(), key=lambda t: t[1],reverse=True)
  unique_words = [word_tf_icf[0] for word_tf_icf in tf_icf_sorted[:int(len(tf_icf_sorted)*10/100)]]
  wordFreCl= {}
  numWorCl={}


In [None]:
ratio_list = [0.5,0.7,0.8]
tmp_list = []
for ratio in ratio_list:
  data_train = df_docs.sample(frac=ratio,random_state=42)
  data_test = df_docs[~df_docs.index.isin(data_train.index)]
  fit(data_train)
