NAME:HARINI   
REG NO: 20BCE1832                                            
WEB MINING LAB WEEK 11


Question 1:

In [2]:
# Implementing sessionization on log entry dataset

# Importing the libraries
from csv import reader
from datetime import datetime

class Sessionize(object):
	"""
	Class that sessionozes the log entries per user.
	The users are identified by using their IP and user agent.
	Then timestamp is used to sessionize the logs based on the H1, H2 and
	H-REF rule that uses the value of delta i.e. the maximum time to be
	considered in a session and theta i.e. the minimum time to be spent
	per page.

	Parameters:
	-----------
	filename: string
		The name of the file that contains the server log entries.

	delta: int
		The maximum time in seconds that can be considered in the same
		session.

	Attributes:
	-----------
	dataset: 2d array
		Contains the entries of the server log file.

	separate: dict
		Contains the unique users using (IP, User-agent) as key and their
		server log entries as value in a list.

	sessions: dict
		Contains the different sessions per user in which the user is the key
		and the differnet sessions are values.

	Returns:
	--------
	"""

	def __init__(self, filename, delta):
		"""Function to initialize the different parameters of the object."""
		self.delta = delta
		csvfile = open(filename, 'r')
		self.dataset_ = list(reader(csvfile))
		self.updateOrderingOfEntries()

	def separateUsers(self):
		"""Function to separate the server log entries based on the user
		i.e. on the basis of the IP and user-agent."""
		self.separate_ = {}
		for row in self.dataset_:
			if row[0] not in self.separate_:
				self.separate_[row[0]] = []
			self.separate_[row[0]].append(row[1:])
		# Updating the timestamp field of the entries.
		self.updateTimestamp()

	def updateTimestamp(self):
		"""Function that updates the timestamp field in a format that
		makes its processing by datetime module easy."""
		for i in self.separate_:
			for j in self.separate_[i]:
				# Trimming unnecessary characters from the entries
				date_time = j[0][1:-6]
				j[0] = date_time

	# def updateOrderingOfEntries(self):
	# 	"""Function to sort the entries in ascending order based on the
	# 	timestamp using Bubble Sort."""

	# 	for i in range(len(self.dataset_) - 1):
	# 		for j in range(i + 1, len(self.dataset_)):
	# 			t1 = datetime.strptime(self.dataset_[i][1][1:-6],
	# 				 "%d/%b/%Y:%H:%M:%S")
	# 			t2 = datetime.strptime(self.dataset_[j][1][1:-6],
	# 				 "%d/%b/%Y:%H:%M:%S")
	# 			if t1 > t2:
	# 				self.dataset_[i], self.dataset_[j] = self.dataset_[j], self.dataset_[i]

	def updateOrderingOfEntries(self):
		"""Function to sort the entries in ascending order based on the
		timestamp using Selection Sort."""

		for i in range(len(self.dataset_)):
			# Find the minimum element in the remaining unsorted array.
			min_idx = i
			t1 = datetime.strptime(self.dataset_[i][1][1:-6],
					 "%d/%b/%Y:%H:%M:%S")
			for j in range(i+1, len(self.dataset_)):
				t2 = datetime.strptime(self.dataset_[j][1][1:-6],
					 "%d/%b/%Y:%H:%M:%S")
				if t1 > t2:
					min_idx = j

			# Swapping the minimum element at ith index
			self.dataset_[i], self.dataset_[min_idx] = self.dataset_[min_idx], self.dataset_[i]

	def createSession(self):
		"""Function to create session for each user based on the different
		rules of sessionization."""
		self.sessions_ = {}
		for i in self.separate_:
			if i not in self.sessions_:
				self.sessions_[i] = []
			for j in range(len(self.separate_[i])):
				temp = []
				present = False
				for l in self.sessions_[i]:
					if self.separate_[i][j] in l:
						present = True
				if not present:
					temp.append(self.separate_[i][j])
					for k in range(j + 1, len(self.separate_[i])):
						t1 = datetime.strptime(self.separate_[i][j][0],
							"%d/%b/%Y:%H:%M:%S")
						t2 = datetime.strptime(self.separate_[i][k][0],
							"%d/%b/%Y:%H:%M:%S")
						latest = max((t1, t2))
						old = min((t1, t2))
						difference = latest - old
						if(difference.seconds <= self.delta):
							temp.append(self.separate_[i][k])
					self.sessions_[i].append(temp)

	def printSessions(self):
		"""Function to print the sessions per user."""
		session_id = 1
		print('%s' % ('-' * 93))
		print('| {:^20} | {:^20} | {:^20} | {:^20} |'.format("Session Id",
			"IP address", "Start Time", "End Time"))
		print('%s' % ('-' * 93))
		for i in self.sessions_:
			for l in self.sessions_[i]:
				dates = []
				for row in l:
					dates.append(datetime.strptime(row[0],
							"%d/%b/%Y:%H:%M:%S"))
				print('| {:^20} | {:^20} | {:^20} | {:^20} |'.format(session_id, i, str(min(dates)), str(max(dates))))
				session_id += 1
		print('%s' % ('-' * 93))

## Main code

filename = input('Enter the name of the dataset: ')
delta = int(input('Enter delta value (minutes): '))
# Coverting the delta form minutes to seconds.
delta *= 60
# Creating the Sessionize object and calling appropriate functions.
session_create = Sessionize(filename, delta)
session_create.separateUsers()
session_create.createSession()
session_create.printSessions()

Enter the name of the dataset: Q1 dataset.csv
Enter delta value (minutes): 30
---------------------------------------------------------------------------------------------
|      Session Id      |      IP address      |      Start Time      |       End Time       |
---------------------------------------------------------------------------------------------
|          1           |    172.20.112.25     | 2000-02-02 10:22:01  | 2000-02-02 10:23:02  |
|          2           |    172.20.112.25     | 2000-02-02 13:10:07  | 2000-02-02 13:10:07  |
|          3           |      12.3.207.3      | 2000-02-02 10:22:02  | 2000-02-02 10:22:02  |
|          4           |      12.3.207.3      | 2000-02-02 11:22:02  | 2000-02-02 11:22:02  |
|          5           |      12.3.207.3      | 2000-02-02 12:02:13  | 2000-02-02 12:02:23  |
---------------------------------------------------------------------------------------------


Question 2:

In [7]:
from csv import reader
from datetime import datetime

class WebSessionAnalyzer:
    def __init__(self, filename, delta_minutes):
        self.delta = delta_minutes * 60  # Convert delta from minutes to seconds
        self.sessions = {}
        self.load_dataset(filename)

    def load_dataset(self, filename):
        with open(filename, 'r') as csvfile:
            csv_reader = reader(csvfile)
            self.dataset = list(csv_reader)

    def create_sessions(self):
        for entry in self.dataset:
            if len(entry) < 5:
                continue  # Skip entries with insufficient elements
            ip, user_agent, timestamp, method, status_code = entry[0], entry[2], entry[1], entry[3], entry[4]
            timestamp = timestamp[1:-6]  # Extract timestamp without brackets
            try:
                timestamp = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S")
            except ValueError:
                continue  # Skip entries with invalid timestamps

            session_key = (ip, user_agent)
            if session_key not in self.sessions:
                self.sessions[session_key] = {'sessions': [], 'GET': 0, 'POST': 0, 'status_codes': {}}

            current_session = self.sessions[session_key]
            if not current_session['sessions'] or (timestamp - current_session['sessions'][-1][1]).seconds > self.delta:
                current_session['sessions'].append((timestamp, timestamp))
            else:
                current_session['sessions'][-1] = (current_session['sessions'][-1][0], timestamp)

            # Count GET and POST methods
            if method == 'GET':
                current_session['GET'] += 1
            elif method == 'POST':
                current_session['POST'] += 1

            # Cluster by status code
            if status_code in current_session['status_codes']:
                current_session['status_codes'][status_code] += 1
            else:
                current_session['status_codes'][status_code] = 1

    def print_sessions(self):
        session_id = 1
        print('-' * 130)
        print('| {:^20} | {:^20} | {:^20} | {:^20} | {:^20} | {:^20} |'.format("Session Id",
            "IP address", "User-Agent", "Start Time", "End Time", "GET Requests", "POST Requests"))
        print('-' * 130)
        for (ip, user_agent), data in self.sessions.items():
            for start_time, end_time in data['sessions']:
                print('| {:^20} | {:^20} | {:^20} | {:^20} | {:^20} | {:^20} |'.format(
                    session_id, ip, user_agent, start_time, end_time, data['GET'], data['POST']))
                session_id += 1
        print('-' * 130)

        print('\nStatus Code Clusters:')
        for (ip, user_agent), data in self.sessions.items():
            print(f'User: {ip}, User-Agent: {user_agent}')
            for status_code, count in data['status_codes'].items():
                print(f'Status Code: {status_code}, Count: {count}')

if __name__ == "__main__":
    filename = input('Enter the name of the dataset: ')
    delta = int(input('Enter delta value (minutes): '))

    web_analyzer = WebSessionAnalyzer(filename, delta)
    web_analyzer.create_sessions()
    web_analyzer.print_sessions()


Enter the name of the dataset: Q1 dataset.csv
Enter delta value (minutes): 30
----------------------------------------------------------------------------------------------------------------------------------
|      Session Id      |      IP address      |      User-Agent      |      Start Time      |       End Time       |     GET Requests     |
----------------------------------------------------------------------------------------------------------------------------------
|          1           |    172.20.112.25     | GET /airmedia/images/welcome.gif?GXHC_gx_session_id_airmedia=c7bf39189d98ea9fHTTP/1.0 | ^20 | ^20 |          0           |
|          2           |    172.20.112.25     | GET /airmedia/images/edit_off.gif?GXHC_gx_session_id_airmedia=c7bf39189d98ea9fHTTP/1.0 | ^20 | ^20 |          0           |
|          3           |    172.20.112.25     | GET /airmedia/images/pubcontent.gif?GXHC_gx_session_id_airmedia=c7bf39189d98ea9fHTTP/1.0 | ^20 | ^20 |          0           |
|  

Question 3:

In [12]:
from csv import reader
from datetime import datetime

class WebSessionAnalyzer:
    def __init__(self, filename, delta_minutes):
        self.delta = delta_minutes * 60  # Convert delta from minutes to seconds
        self.sessions_ip = {}
        self.sessions_agent = {}
        self.sessions_combined = {}
        self.load_dataset(filename)

    def load_dataset(self, filename):
        with open(filename, 'r') as csvfile:
            csv_reader = reader(csvfile)
            self.dataset = list(csv_reader)

    def create_sessions(self):
        for entry in self.dataset:
            if len(entry) < 5:
                continue  # Skip entries with insufficient elements
            ip, user_agent, timestamp, method, status_code = entry[0], entry[2], entry[1], entry[3], entry[4]
            timestamp = timestamp[1:-6]  # Extract timestamp without brackets
            try:
                timestamp = datetime.strptime(timestamp, "%d/%b/%Y:%H:%M:%S")
            except ValueError:
                continue  # Skip entries with invalid timestamps

            # IP-based sessionization
            session_key_ip = ip
            if session_key_ip not in self.sessions_ip:
                self.sessions_ip[session_key_ip] = {'sessions': [], 'GET': 0, 'POST': 0, 'status_codes': {}}

            current_session_ip = self.sessions_ip[session_key_ip]
            if not current_session_ip['sessions'] or (timestamp - current_session_ip['sessions'][-1][1]).seconds > self.delta:
                current_session_ip['sessions'].append((timestamp, timestamp))
            else:
                current_session_ip['sessions'][-1] = (current_session_ip['sessions'][-1][0], timestamp)

            # User Agent-based sessionization
            session_key_agent = user_agent
            if session_key_agent not in self.sessions_agent:
                self.sessions_agent[session_key_agent] = {'sessions': [], 'GET': 0, 'POST': 0, 'status_codes': {}}

            current_session_agent = self.sessions_agent[session_key_agent]
            if not current_session_agent['sessions'] or (timestamp - current_session_agent['sessions'][-1][1]).seconds > self.delta:
                current_session_agent['sessions'].append((timestamp, timestamp))
            else:
                current_session_agent['sessions'][-1] = (current_session_agent['sessions'][-1][0], timestamp)

            # IP Address + User Agent-based sessionization
            session_key_combined = (ip, user_agent)
            if session_key_combined not in self.sessions_combined:
                self.sessions_combined[session_key_combined] = {'sessions': [], 'GET': 0, 'POST': 0, 'status_codes': {}}

            current_session_combined = self.sessions_combined[session_key_combined]
            if not current_session_combined['sessions'] or (timestamp - current_session_combined['sessions'][-1][1]).seconds > self.delta:
                current_session_combined['sessions'].append((timestamp, timestamp))
            else:
                current_session_combined['sessions'][-1] = (current_session_combined['sessions'][-1][0], timestamp)

            # Count GET and POST methods for all sessionization methods
            if method == 'GET':
                current_session_ip['GET'] += 1
                current_session_agent['GET'] += 1
                current_session_combined['GET'] += 1
            elif method == 'POST':
                current_session_ip['POST'] += 1
                current_session_agent['POST'] += 1
                current_session_combined['POST'] += 1

            # Cluster by status code for all sessionization methods
            if status_code in current_session_ip['status_codes']:
                current_session_ip['status_codes'][status_code] += 1
            else:
                current_session_ip['status_codes'][status_code] = 1

            if status_code in current_session_agent['status_codes']:
                current_session_agent['status_codes'][status_code] += 1
            else:
                current_session_agent['status_codes'][status_code] = 1

            if status_code in current_session_combined['status_codes']:
                current_session_combined['status_codes'][status_code] += 1
            else:
                current_session_combined['status_codes'][status_code] = 1

    def print_sessions(self, sessionization_method):
        session_id = 1
        print('-' * 130)
        print('| {:^20} | {:^20} | {:^20} | {:^20} | {:^20} | {:^20} |'.format("Session Id",
            "IP address / User-Agent / Combined", "Start Time", "End Time", "GET Requests", "POST Requests"))
        print('-' * 130)
        if sessionization_method == "IP":
            sessions = self.sessions_ip
        elif sessionization_method == "User-Agent":
            sessions = self.sessions_agent
        elif sessionization_method == "Combined":
            sessions = self.sessions_combined
        else:
            print("Invalid sessionization method.")
            return

        for key, data in sessions.items():
            for start_time, end_time in data['sessions']:
                print('| {:^20} | {:^20} | {:^20} | {:^20} | {:^20} | {:^20} |'.format(
                    session_id, key, start_time, end_time, data['GET'], data['POST']))
                session_id += 1
        print('-' * 130)

        print(f'\nStatus Code Clusters ({sessionization_method} method):')
        for key, data in sessions.items():
            print(f'{sessionization_method}: {key}')
            for status_code, count in data['status_codes'].items():
                print(f'Status Code: {status_code}, Count: {count}')

if __name__ == "__main__":
    filename = input('Enter the name of the dataset: ')
    delta = int(input('Enter delta value (minutes): '))

    web_analyzer = WebSessionAnalyzer(filename, delta)
    web_analyzer.create_sessions()

    sessionization_method = input('Enter sessionization method (IP/User-Agent/Combined): ')
    web_analyzer.print_sessions(sessionization_method)


Enter the name of the dataset: Q2 dataset.csv
Enter delta value (minutes): 30
Enter sessionization method (IP/User-Agent/Combined): Combined
----------------------------------------------------------------------------------------------------------------------------------
|      Session Id      | IP address / User-Agent / Combined |      Start Time      |       End Time       |     GET Requests     |    POST Requests     |
----------------------------------------------------------------------------------------------------------------------------------
----------------------------------------------------------------------------------------------------------------------------------

Status Code Clusters (Combined method):
