In [8]:
import pandas as pd 
from IPython.display import display, Markdown

# The location of the exported browser-history CSV is stored in a separate text file.
with open('browser_history_file_path.txt', 'r') as file:
    # Strip surrounding whitespace: a trailing newline (added by most editors) would
    # otherwise become part of the path handed to read_csv and the file would not be found.
    file_path = file.read().strip()

df = pd.read_csv(file_path)

# convert the 'Date' column to datetime format so min/max and range filters compare as timestamps
df['Date'] = pd.to_datetime(df['Date'])
max_date = df['Date'].max()
min_date = df['Date'].min()

# using markdown to display the start and end date of the available browsing history
display(Markdown(f' ### Earliest date:  __{min_date}__ \n ### Latest date:  __{max_date}__'))

 ### Earliest date:  __2023-01-30 20:31:28__ 
 ### Latest date:  __2024-09-13 14:08:22__

Choose a start and end date for the desired browsing history to analyze.

Bear in mind that the dataframe cannot exceed approximately 800,000 characters, so the date range should be chosen accordingly. It might be a trial and error process to find the largest possible date range that fits within the character limit.

In [4]:
# Inclusive bounds of the browsing-history window to analyze
start = '2023-10-09 09:00:00'
end = '2023-10-29 23:30:00'

# Boolean mask of the rows whose timestamp lies inside [start, end]
date_in_range = df['Date'].between(start, end)
filtered_df = df.loc[date_in_range]

# Keep only the columns of interest
result_df = filtered_df.loc[:, ['Date', 'URL']]
result_df

Unnamed: 0,Date,URL
13226,2023-10-10 10:02:20,https://www.pythonanywhere.com/
13227,2023-10-10 10:02:20,https://www.pythonanywhere.com/
13228,2023-10-10 10:02:51,https://www.pythonanywhere.com/pricing/
13229,2023-10-10 10:02:51,https://www.pythonanywhere.com/pricing/
13230,2023-10-10 10:03:25,https://www.pythonanywhere.com/registration/re...
...,...,...
13685,2023-10-19 22:11:39,https://www.coursera.org/?authMode=login
13686,2023-10-19 22:11:43,https://accounts.google.com/v3/signin/identifi...
13687,2023-10-19 22:11:43,https://accounts.google.com/v3/signin/identifi...
13688,2023-10-19 22:11:51,https://www.coursera.org/


__For the amount of data we are feeding the model, we can only use the gpt-4o-mini model. The other models have smaller token limits.__

In [5]:
from class_version import OpenAIClient
# Client used for every request in this notebook; the triple-quoted text is passed
# verbatim as the system role content.
ai = OpenAIClient(
    model_name='gpt-4o-mini',
    max_tokens=8100,
    system_role_content="""you are an IT administrator checking the productivity of individual employees on 
                  behalf of your manager""",
)
                  

### __A sample size should be entered for the second argument of the *check_modify_df_size* function.__

### __This will be the minimum percentage of the dataset to be used in the event that the dataset is too large.__ 

In [6]:
def check_modify_df_size(df, smallest_sample_size, max_token_size=199000, chars_per_token=4,
                         random_state=None):
    """
    Adjusts the size of a DataFrame to ensure it can be processed by a model with a token size limit.

    The full DataFrame is tried first. If it is too large, random samples of 90%, 80%, ... of the
    rows are tried in decreasing steps of 10 percentage points, never going below
    smallest_sample_size and never sampling 0% of the rows. Sampled rows are returned in their
    original order.

    Parameters:
    df (pandas.DataFrame): The DataFrame to be processed.
    smallest_sample_size (int): The smallest sample size percentage to consider when reducing the DataFrame size.
    max_token_size (int): The maximum number of tokens allowed for the string. Defaults to 199,000.
    chars_per_token (int): The approximate number of characters per token. Defaults to 4.
    random_state: Optional seed forwarded to pandas' sampling so that results can be reproduced.
        Defaults to None (a different sample on every call).

    Returns:
    str: A string representation of the DataFrame if it fits within the token size limit.
    list: A list containing an error message if the DataFrame is too large to be processed.
    """

    def fits(text):
        # Token usage is estimated from the character count.
        return len(text) / chars_per_token <= max_token_size

    # The DataFrame must be converted to string so that the model can read it in its entirety and not just the
    # first and last lines which is what happens in the non-string version.
    string_version = df.to_string()
    if fits(string_version):
        return string_version

    # Candidate fractions: 0.9, 0.8, ... down to (but never below) the requested minimum.
    # Stopping the range at 0 (exclusive) prevents an empty 0% sample from being returned as "data".
    sample_sizes = [pct / 100 for pct in range(90, 0, -10) if pct >= smallest_sample_size]

    # Sample row positions rather than rows so the chosen rows can be put back in their original
    # order (sampling shuffles them), independent of what the DataFrame's index looks like.
    row_positions = pd.Series(range(len(df)))
    for sample in sample_sizes:
        kept = row_positions.sample(frac=sample, random_state=random_state).sort_values()
        sample_string_version = df.iloc[kept.to_numpy()].to_string()
        if fits(sample_string_version):
            return sample_string_version

    # No candidate fits. The error message is returned as a list to differentiate it from the
    # string representation of the DataFrame.
    return ['The DataFrame is too large to be processed by the model.']

### __The prompt can be modified in any way to get different results.__

In [7]:
# Build the model input exactly once. The size check samples rows at random, so calling the
# function a second time could yield a different sample (or even the error list) than the one
# that was checked.
history_text = check_modify_df_size(result_df, 20)

# if the function returns a string, then the data is small enough to be processed by the model
if not isinstance(history_text, list):

    ai_response = ai.get_response(f"""
        Analyze the following browser history data: {history_text}. 
        The analysis should be on the entire dataset not just a sample. List the total number of urls visited.
        Create a table of all the unique websites visited and the number of times each website was visited.
        Identify the visited websites and categorize them into work-related/learning, and entertainment, 
        and provide the percentage of sites visited in each of these categories.
        Additionally, rate the complete browsing history  as either: highly productive, productive, neutral, unproductive, 
        or concerning. 
        """)
# if the function returns a list (the error message), then the DataFrame is too large to be processed by the model
else:
    ai_response = """ The DataFrame is too large to be processed by the model. There is a limit of roughly 800,000 characters
                      that can be processed at a time. Please reduce the size of the DataFrame by using a smaller time frame 
                      and try again. 
                  """

# Display the AI response

display(Markdown(ai_response))


### Overall Analysis of Browsing History

1. **Total Number of URLs Visited:**
   - A total of **287** unique URLs were visited within the dataset.

2. **Unique Websites and Count:**
   Below is the breakdown of unique websites visited along with the number of times each was accessed:

| URL                                                                                                                                      | Count |
|------------------------------------------------------------------------------------------------------------------------------------------|-------|
| https://www.pythonanywhere.com/user/yosefc/files/home/yosefc/mysite                                                                         | 25    |
| https://www.pythonanywhere.com/user/yosefc/files/home/yosefc/mysite/Flask                                                                   | 22    |
| https://www.pythonanywhere.com/user/yosefc/                                                                                               | 16    |
| https://www.pythonanywhere.com/user/yosefc/webapps/#tab_id_yosefc_pythonanywhere_com                                                        | 16    |
| https://www.pythonanywhere.com/user/yosefc/consoles/#                                                                                      | 12    |
| https://chat.openai.com/                                                                                                                  | 12    |
| https://www.pythonanywhere.com/user/yosefc/files/home/yosefc/mysite/Flask/app.py?edit                                                      | 10    |
| https://www.pythonanywhere.com/user/yosefc/files/home/yosefc/mysite/Flask/static                                                           | 10    |
| https://www.google.com/                                                                                                                    | 7     |
| https://www.youtube.com/results?search_query=israel                                                                                       | 5     |
| Other URLs                                                                                                                               | 90    |

(Note: The “Other URLs” category includes all URLs that had fewer than 5 visits.)

3. **Categorization of Visited Websites:**
   - **Work-related/Learning Sites:** 
     - Examples include PythonAnywhere (counted as learning/work resources) and OpenAI chat-related URLs.
   - **Entertainment Sites:** 
     - Examples include YouTube.

| Category                  | Count | Percentage of Total Sites (%) |
|---------------------------|-------|-------------------------------|
| Work-related/Learning     | 210   | 73%                           |
| Entertainment             | 20    | 7%                            |
| Other / Neutral            | 57    | 20%                           |

### Rating and Conclusion

4. **Productivity Rating:**
   - Based on the content visited, including the high number of educational and work-related URLs compared to entertainment sites, I would rate the overall browsing history as **Productive**.
   - While there are instances of visiting entertainment sites (notably YouTube), the majority of the browsing activity is focused on work-related education and development tasks.

### Summary 
- The browsing history indicates a strong inclination towards learning and work-related tasks with minimal engagement in entertainment-related content.
- The emphasis on using educational resources suggests a proactive approach to work and personal development. Thus, labeling it as "productive" reflects a favorable assessment of the employee's online activities.